如何将在cpu运行的程序改成在cpu/gpu上运行的程序_大霞上仙

http://blog.sina.com.cn/u/6319164373

首页博文目录关于我

个人资料

微博

加好友发纸条

写留言加关注

博客等级：
博客积分：

博客访问：
关注人气：
获赠金笔：0支
赠出金笔：0支
荣誉徽章：

正文字体大小：大中小

如何将在cpu运行的程序改成在cpu/gpu上运行的程序

(2018-04-18 14:20:48)

标签：

cuda

分类：并行编程：GPU/MPI/OPEN***

1.gpu上创建内存

2.思考创建多少thread、block、grid及它们的维数(一维可用size_t，多维用dim3(x,y,z))

3.把要在gpu上运行的函数改写成核函数 __global__ void fun() (每个线程只处理一部分数据，因此原来的for循环可以减少或去掉)

4.执行在gpu上运行的程序 fun<<<numberOfBlocks, threadsPerBlock>>>()

5.host与device的同步

6.释放gpu上的内存

例子：//原在CPU上运行的向量相加的程序

#include <stdio.h>
#include <stdlib.h>

/*
 * Fill the first N elements of `a` with the value `num`.
 *
 *   num - value written to every element
 *   a   - destination array; must hold at least N floats
 *   N   - number of elements to write (nothing is written when N <= 0)
 */
void initWith(float num, float *a, int N)
{
  for (int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}

/*
 * Element-wise vector addition on the CPU: result[i] = a[i] + b[i]
 * for every i in [0, N).
 *
 *   result - output array; must hold at least N floats
 *   a, b   - input arrays; each must hold at least N floats
 *   N      - number of elements to process
 */
void addVectorsInto(float *result, float *a, float *b, int N)
{
  for (int i = 0; i < N; ++i)
  {
    result[i] = a[i] + b[i];
  }
}

/*
 * Verify that the first N elements of `array` all equal `target`.
 *
 * On the first mismatch a FAIL line is printed and the process exits with
 * status 1. When every element matches, a single SUCCESS line is printed
 * after the loop has finished.
 *
 *   target - expected value of every element
 *   array  - array to inspect; must hold at least N floats
 *   N      - number of elements to inspect
 */
void checkElementsAre(float target, float *array, int N)
{
  for (int i = 0; i < N; i++)
  {
    if (array[i] != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
      exit(1);
    }
  }

  /* Reached only if no element differed from target. */
  printf("SUCCESS! All values added correctly.\n");
}

/*
 * CPU reference program: allocates three float vectors on the heap, fills
 * a with 3, b with 4 and c with 0, computes c = a + b, and verifies that
 * every element of c equals 7.
 *
 * Returns 0 on success and 1 if the host allocations fail
 * (checkElementsAre terminates the process itself on a wrong result).
 */
int main()
{
  const int N = 2<<20;              /* 2 * 2^20 = 2,097,152 elements */
  size_t size = N * sizeof(float);  /* bytes per vector */

  float *a = (float *)malloc(size);
  float *b = (float *)malloc(size);
  float *c = (float *)malloc(size);

  /* malloc may fail; do not touch the buffers unless all three exist. */
  if (a == NULL || b == NULL || c == NULL)
  {
    fprintf(stderr, "FAIL: could not allocate %zu bytes per vector\n", size);
    free(a);  /* free(NULL) is a no-op */
    free(b);
    free(c);
    return 1;
  }

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  addVectorsInto(c, a, b, N);

  checkElementsAre(7, c, N);

  free(a);
  free(b);
  free(c);

  return 0;
}

-----------------------------------------------------------------------------------

//在GPU上运行的向量相加的程序

#include <stdio.h>

#include <assert.h>

/*
 * Pass-through checker for CUDA runtime status codes.
 *
 * A status of cudaSuccess is returned untouched. Any other status has its
 * error string written to stderr and then trips the assert below; the
 * status is returned to the caller if execution continues past the assert.
 */
inline cudaError_t checkCuda(cudaError_t result)
{
  // Fast path: nothing to report.
  if (result == cudaSuccess)
    return result;

  fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
  assert(result == cudaSuccess);
  return result;
}

/*
 * Host-side fill: writes `num` into the first N elements of `a`.
 *
 *   num - value written to every element
 *   a   - destination array; must be dereferenceable from host code and
 *         hold at least N floats
 *   N   - number of elements to write (nothing is written when N <= 0)
 */
void initWith(float num, float *a, int N)
{
  for (int i = 0; i < N; ++i)
  {
    a[i] = num;
  }
}

/*
 * Kernel: element-wise vector addition, result[i] = a[i] + b[i] for i in [0, N).
 *
 * Uses a grid-stride loop: each thread starts at its global 1D index and
 * advances by the total number of threads in the grid, so any 1D launch
 * configuration covers all N elements and the loop bound doubles as the
 * out-of-range guard.
 *
 *   result - output array of at least N floats, accessible from the device
 *   a, b   - input arrays of at least N floats, accessible from the device
 *   N      - number of elements to process
 */
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;  // global thread id
  int stride = blockDim.x * gridDim.x;                // threads in the whole grid

  for (int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}

/*
 * Host-side verification that the first N elements of `array` equal `target`.
 *
 * On the first mismatch a FAIL line is printed and the process exits with
 * status 1. When every element matches, a SUCCESS line is printed so that a
 * passing run produces visible output, matching the CPU version of this
 * example.
 *
 *   target - expected value of every element
 *   array  - array to inspect; must be dereferenceable from host code
 *   N      - number of elements to inspect
 */
void checkElementsAre(float target, float *array, int N)
{
  for (int i = 0; i < N; i++)
  {
    if (array[i] != target)
    {
      printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
      exit(1);
    }
  }

  // Reached only if no element differed from target.
  printf("SUCCESS! All values added correctly.\n");
}

/*
 * GPU vector-add driver.
 *
 * Steps: allocate three vectors with cudaMallocManaged, initialise them on
 * the host (a = 3, b = 4, c = 0), launch addVectorsInto on the device,
 * wait for it to finish, verify on the host that every element of c is 7,
 * then release the allocations. Every CUDA runtime call goes through
 * checkCuda.
 */
int main()
{
  const int N = 2<<20;              // 2 * 2^20 = 2,097,152 elements
  size_t size = N * sizeof(float);  // bytes per vector

  float *a;
  float *b;
  float *c;

  checkCuda( cudaMallocManaged(&a, size) );
  checkCuda( cudaMallocManaged(&b, size) );
  checkCuda( cudaMallocManaged(&c, size) );

  initWith(3, a, N);
  initWith(4, b, N);
  initWith(0, c, N);

  size_t threadsPerBlock;
  size_t numberOfBlocks;

  threadsPerBlock = 256;
  // Ceiling division: enough blocks to give every element its own thread.
  numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

  // Execution configuration is <<<blocks in grid, threads per block>>>.
  addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

  // A launch returns no status itself; launch errors are read back here.
  checkCuda( cudaGetLastError() );
  // Block until the kernel has finished before the host reads c.
  checkCuda( cudaDeviceSynchronize() );

  checkElementsAre(7, c, N);

  checkCuda( cudaFree(a) );
  checkCuda( cudaFree(b) );
  checkCuda( cudaFree(c) );

  return 0;
}

**************************************************************************************

//在GPU上运行的矩阵相乘的程序

#include <stdio.h>

#define N 64

/*
 * Kernel: c = a * b for N x N int matrices stored row-major
 * (element [r][q] lives at index r * N + q).
 *
 * Expected launch: a 2D grid of 2D blocks providing at least N threads in
 * both x and y. Each thread computes at most one output element; threads
 * whose coordinates fall outside the matrix do nothing, so the grid may
 * overshoot N.
 *
 *   a, b - input matrices, N * N ints each, accessible from the device
 *   c    - output matrix, N * N ints, accessible from the device
 */
__global__ void matrixMulGPU( int * a, int * b, int * c )
{
  // Map x to the column and y to the row: threads that are adjacent in x
  // then read b and write c at adjacent addresses (coalesced access).
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  if (row < N && col < N)
  {
    int val = 0;

    // Dot product of row `row` of a with column `col` of b.
    for ( int k = 0; k < N; ++k )
      val += a[row * N + k] * b[k * N + col];

    c[row * N + col] = val;
  }
}

/*
 * CPU reference: c = a * b for N x N int matrices stored row-major
 * (element [r][q] lives at index r * N + q).
 *
 *   a, b - input matrices, N * N ints each
 *   c    - output matrix, N * N ints; every element is overwritten
 */
void matrixMulCPU( int * a, int * b, int * c )
{
  for( int row = 0; row < N; ++row )
    for( int col = 0; col < N; ++col )
    {
      int val = 0;

      // Dot product of row `row` of a with column `col` of b.
      for ( int k = 0; k < N; ++k )
        val += a[row * N + k] * b[k * N + col];

      c[row * N + col] = val;
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;

  fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
  return false;
}

/*
 * Matrix-multiply driver: multiplies two N x N int matrices on the GPU and
 * on the CPU and compares the two results element by element.
 *
 * Returns 0 when the results agree, 1 on a CUDA error or a mismatch.
 */
int main()
{
  int *a = 0, *b = 0, *c_cpu = 0, *c_gpu = 0;

  int size = N * N * sizeof (int); // Number of bytes of an N x N matrix

  // Allocate memory usable from both host and device code
  if (!cudaOk(cudaMallocManaged (&a, size),     "cudaMallocManaged(a)")     ||
      !cudaOk(cudaMallocManaged (&b, size),     "cudaMallocManaged(b)")     ||
      !cudaOk(cudaMallocManaged (&c_cpu, size), "cudaMallocManaged(c_cpu)") ||
      !cudaOk(cudaMallocManaged (&c_gpu, size), "cudaMallocManaged(c_gpu)"))
  {
    // Pointers that were never allocated are still null here.
    cudaFree(a); cudaFree(b);
    cudaFree( c_cpu ); cudaFree( c_gpu );
    return 1;
  }

  // Initialize memory
  for( int row = 0; row < N; ++row )
    for( int col = 0; col < N; ++col )
    {
      a[row*N + col] = row;
      b[row*N + col] = col+2;
      c_cpu[row*N + col] = 0;
      c_gpu[row*N + col] = 0;
    }

  dim3 threads_per_block (16, 16, 1); // A 16 x 16 block threads

  // Ceiling division: just enough blocks to cover N in each dimension,
  // with no surplus block when N is a multiple of the block size.
  dim3 number_of_blocks ((N + threads_per_block.x - 1) / threads_per_block.x,
                         (N + threads_per_block.y - 1) / threads_per_block.y,
                         1);

  matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );

  // A launch returns no status itself: read back launch errors, then wait
  // for the GPU to finish (which also surfaces errors raised while running).
  bool gpu_ok = cudaOk(cudaGetLastError(), "matrixMulGPU launch") &&
                cudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

  bool error = !gpu_ok;

  if (gpu_ok)
  {
    // Call the CPU version to check our work
    matrixMulCPU( a, b, c_cpu );

    // Compare the two answers to make sure they are equal; the loop
    // conditions stop both loops at the first mismatch.
    for( int row = 0; row < N && !error; ++row )
      for( int col = 0; col < N && !error; ++col )
        if (c_cpu[row * N + col] != c_gpu[row * N + col])
        {
          printf("FOUND ERROR at c[%d][%d]\n", row, col);
          error = true;
        }

    if (!error)
      printf("Success!\n");
  }

  // Free all our allocated memory
  cudaFree(a); cudaFree(b);
  cudaFree( c_cpu ); cudaFree( c_gpu );

  return error ? 1 : 0;
}

阅读┊ 收藏 ┊ 喜欢 ▼ ┊打印┊举报/Report

前一篇：时间复杂度/空间复杂度

后一篇：docker介绍

新浪BLOG意见反馈留言板　欢迎批评指正