CUDA中Texture Memory的学习_平凡的世界

http://blog.sina.com.cn/u/2557742573

首页博文目录关于我

个人资料

微博

加好友发纸条

写留言加关注

博客等级：
博客积分：

博客访问：
关注人气：
获赠金笔：0支
赠出金笔：0支
荣誉徽章：

正文字体大小：大中小

CUDA中Texture Memory的学习

(2014-03-26 16:39:04)

今天完整的阅读了CUDA C Programming Guide中关于Texture Memory的几个部分。

重要的地方做了翻译。

同时，对CUDA Samples中的SimpleTexture作了修改测试，使符合我自己最后需要的应用方式。

在SobelFilter例程中，使用到了texture ，准确来说是，texture reference(而

不是texture object，虽然两者区别并不是很大)。

为什么要使用texture是我看这一部分的主要原因，因为我考虑的是，host读取image后，copy到device端(存储在global memory中)。

那么kernel调用中就可以直接操作图像的各个像素了，为什么我需要一个

texture，它会更快么？而且在kernel中取一个像素点，还要进行Texture Fetch操作(我这

里是用Tex2D())。

所以搜索了一下，关于这个问题，stackoverflow上有人做了一些回答：

http://stackoverflow.com/questions/8306967/constant-memory-vs-texture-memory-vs-global-memory-in-cuda

http://forum.beyond3d.com/showthread.php?t=52510

简单总结是这样的：

Texture Memory:

Cache optimized for 2D spatial access pattern

Reads have some advantages like address modes and interpolation that can be used

at no extra cost

Global Memory:

Slow & uncached(1.0),cached(2.0)

Requires sequential & aligned 16 byte reads and writes to be fast (coalesced

read/write)

Texture memory针对2D访问优化，且带有cache，而且针对访问模式和插值都很有帮助。

Global memory没有cache。

但是，看起来，Texture 这些优势，除了cache之外，对于我的应用似乎并没有太

大的好处。纠结之余，还是把文档看完了，把例程修改了，测试了一把。其实也不能说是测

试，只是想看看，原始放进Texture的数据，读出来还是不是老样子。

文档中某些片段翻译记录如下，Texture Memory里面主要介绍Texture Object和Texture Reference，其他内容如Texture Gather 、Layered Texture就只是粗略看一下了。

3.2.10.1 Texture Memory中某段

read mode，可以设为cudaReadModeNormalizedFloat或者是cudaReadModeElementType.如果是前者，而且texel的类型为16bit或者8bit整型数，则由texture fetch函数返回的值，实际上是浮点数类型，而且无符号整型数的范围被映射到[0.0,1.0]，有符号数被映射到[-1.0,1.0]。例如，无符号8bit的元素使用0xff来读，被认为是1.但如果采用的是后一种类型，则不会做任何转换。

另外一段：

Whether texture coordinates are normalized or not. By default, textures are

referenced (by the functions of Texture Functions) using floating-point coordinates in

the range [0, N-1] where N is the size of the texture in the dimension corresponding

to the coordinate. For example, a texture that is 64x32 in size will be referenced with

coordinates in the range [0, 63] and [0, 31] for the x and y dimensions, respectively.

Normalized texture coordinates cause the coordinates to be specified in the range

[0.0, 1.0-1/N] instead of [0, N-1]。

3.2.10.1.2 Texture Reference API中某段

channelDesc describes the format of the texel; it must match the DataType argument of the texture reference declaration

还有关于texture 绑定数据的说明：

在内核可以使用texture reference从texture memory中读取数据之前，texture reference必须被绑定到一个texture上，使用cudaBindTexture()或者cudaBindTexture2D()绑定到线性存储器中，亦或者cudaBindTextureToArray()banding到CUDA array上。cudaUnbindTexture()用于解绑。建议分配二维的texture线性存储器时，使用cudaMallocPitch()，并且使用它返回的pitch值，作为参数给到cudaBindTexture2D()。

绑定一个texture到texture reference的的格式，必须与声明texture reference时候的格式一致，否则，texture fetch的结果会没有定义。

还有个比较重要的：

texture 和surface内存是有缓存的,而且在同一个kernel调用中，这份缓存不会与global内存的写操作保持一致，所以任何对已经由同一个kernel调用的global write写过后的地址，进行的读操作将会返回未定义的值。换句话说，一个线程可以安全的读取一些texture内存位置，当且仅当这个内存地址已经被以前的内核调用或者内存拷贝修改过，但是如果是被同一个内核调用中的同一个线程或者其它线程修改过，那就会失败。

测试代码，将CUDA的例程做了较大的修改，验证一下我的想法。

__global__ void transformKernel_integer(unsigned char *outputData,

int width,

int height

)

{

// calculate normalized texture coordinates

//计算归一化的纹理坐标，[0,1)

//感觉这个地方存在问题，blockIdx 和threadIdx均以0为开始，

//比如对于两者为0时候，则算得x=0，y=0;自然从tex中取得的数据就不行了

unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;

unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

//简单的将texture里面的数据读取出来。

outputData[y*width + x] = tex2D(tex_integer, x,y);// tu, tv);

}

void runTest_integer(int argc, char **argv)

{

int devID = findCudaDevice(argc, (const char **) argv);

// load image from disk

//无符号整形

unsigned char *hData = NULL;

unsigned int width, height;

char *imagePath = sdkFindFilePath(imageFilename, argv[0]);

if (imagePath == NULL)

{

printf("Unable to source image file: %s\n", imageFilename);

exit(EXIT_FAILURE);

}

sdkLoadPGM(imagePath, &hData, &width, &height);

//采用无符号整形

unsigned int size = width * height * sizeof(unsigned char);

printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);

// Load reference image from image (output)

//采用无符号整形

unsigned char *hDataRef = (unsigned char *) malloc(size);

char *refPath = sdkFindFilePath(refFilename, argv[0]);

if (refPath == NULL)

{

printf("Unable to find reference image file: %s\n", refFilename);

exit(EXIT_FAILURE);

}

sdkLoadPGM(refPath, &hDataRef, &width, &height);

// Allocate device memory for result

//采用无符号整形

unsigned char *dData = NULL;

checkCudaErrors(cudaMalloc((void **) &dData, size));

// Allocate array and copy image data

//采用无符号整形

//必须与texture reference的DataType相同

//而且我们采用了unsigned char 的话，只需要8bit

cudaChannelFormatDesc channelDesc =

cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);

cudaArray *cuArray;

checkCudaErrors(cudaMallocArray(&cuArray,

&channelDesc,

width,

height));

checkCudaErrors(cudaMemcpyToArray(cuArray,

hData,

size,

cudaMemcpyHostToDevice));

// Set texture parameters

//设置为wrap模式，一旦越界就为0

tex_integer.addressMode[0] = cudaAddressModeWrap;

tex_integer.addressMode[1] = cudaAddressModeWrap;

tex_integer.addressMode[2] = cudaAddressModeWrap;

//采用point模式，取最近值，而非二维插值

tex_integer.filterMode = cudaFilterModePoint;

//使用非归一化的方式，

tex_integer.normalized = false;

// Bind the array to the texture

checkCudaErrors(cudaBindTextureToArray(tex_integer, cuArray, channelDesc));

dim3 dimBlock(16, 16, 1);

//CUDA 源代码里面，这个地方是错误的，修正如下

// dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);

dim3 dimGrid(( width + dimBlock.x - 1) / dimBlock.x,

( height + dimBlock.y -1) / dimBlock.y, 1);

// Warmup

transformKernel_integer<<>>(dData, width, height);

checkCudaErrors(cudaDeviceSynchronize());

StopWatchInterface *timer = NULL;

sdkCreateTimer(&timer);

sdkStartTimer(&timer);

// Execute the kernel

transformKernel_integer<<>>(dData, width, height);

// Check if kernel execution generated an error

getLastCudaError("Kernel execution failed");

checkCudaErrors(cudaDeviceSynchronize());

sdkStopTimer(&timer);

printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));

printf("%.2f Mpixels/sec\n",

(width *height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);

sdkDeleteTimer(&timer);

// Allocate mem for the result on host side

unsigned char *hOutputData = (unsigned char *) malloc(size);

// copy result from device to host

checkCudaErrors(cudaMemcpy(hOutputData,

dData,

size,

cudaMemcpyDeviceToHost));

// Write result to file

char outputFilename[1024];

strcpy(outputFilename, imagePath);

strcpy(outputFilename + strlen(imagePath) - 4, "_integer_out.pgm");

sdkSavePGM(outputFilename, hOutputData, width, height);

printf("Wrote '%s'\n", outputFilename);

// Write regression file if necessary

if (checkCmdLineFlag(argc, (const char **) argv, "regression"))

{

// Write file for regression test

//采用无符号整形

sdkWriteFile("./data/regression.dat",

hOutputData,

width*height,

0.0f,

false);

}

else

{

// We need to reload the data from disk,

// because it is inverted upon output

sdkLoadPGM(outputFilename, &hOutputData, &width, &height);

printf("Comparing files\n");

printf("\toutput:

阅读┊ 收藏 ┊ 喜欢 ▼ ┊打印┊举报/Report

前一篇：CUDA并行运算，block ，thread计算测试【续-优化算法】

后一篇：CUDA 中cudaDeviceSynchronize()同步，异步，timer

新浪BLOG意见反馈留言板　欢迎批评指正