如何将在cpu运行的程序改成在cpu/gpu上运行的程序
(2018-04-18 14:20:48)
标签:
cuda |
分类: 并行编程:GPU/MPI/OPEN*** |
1.gpu上创建内存
for(int i = 0; i < N;
++i)
{
a[i] =
num;
}
for(int i = 0; i < N;
++i)
{
result[i]
= a[i] + b[i];
}
for(int i = 0; i < N;
i++)
{
if(array[i] != target)
{
printf("FAIL: array[%d] - %0.0f does not equal
%0.0f\n", i, array[i], target);
exit(1);
}
}
printf("SUCCESS! All values
added correctly.\n");
const int N =
2<<20;
size_t size = N *
sizeof(float);
float *a;
float *b;
float *c;
a = (float
*)malloc(size);
b = (float
*)malloc(size);
c = (float
*)malloc(size);
initWith(3, a,
N);
initWith(4, b,
N);
initWith(0, c,
N);
addVectorsInto(c, a, b,
N);
checkElementsAre(7, c,
N);
free(a);
free(b);
free(c);
// Pass-through status check: a successful status is returned untouched.
if (result == cudaSuccess)
{
  return result;
}

// Failure path: print the runtime's description of the error to stderr, then
// assert (which aborts unless assertions are compiled out) and hand the
// status back to the caller.
const char *description = cudaGetErrorString(result);
fprintf(stderr, "CUDA Runtime Error: %s\n", description);
assert(result == cudaSuccess);
return result;
// Host-side fill: every slot of `a` in [0, N) is set to `num`.
for (int slot = 0; slot < N; slot += 1)
{
  a[slot] = num;
}
// Grid-stride loop: each thread starts at its global index and advances by the
// total number of launched threads (blockDim.x * gridDim.x) until it passes N,
// so any launch configuration covers all N elements.
const int totalThreads = blockDim.x * gridDim.x;
int i = threadIdx.x + blockIdx.x * blockDim.x;

while (i < N)
{
  result[i] = a[i] + b[i];
  i += totalThreads;
}
// Verify that each of the first N entries of `array` equals `target`.
// On the first mismatch, print its index together with the actual and expected
// values, then terminate the process with exit status 1.
for(int i = 0; i < N; i++)
{
  if(array[i] != target)
  {
    // The format string is kept on a single line: a raw line break inside a
    // string literal does not compile.
    printf("FAIL: array[%d] - %0.0f does not equal %0.0f\n", i, array[i], target);
    exit(1);
  }
}
// Driver for the GPU vector addition: allocate managed memory, fill the
// arrays on the host, launch the kernel, wait for it, verify, and free.
const int N = 2<<20;               // 2 * 2^20 = 2,097,152 elements
size_t size = N * sizeof(float);   // bytes per array

float *a;
float *b;
float *c;

// Managed allocations: the same pointers are used by the host loops below
// and by the kernel.
checkCuda( cudaMallocManaged(&a, size) );
checkCuda( cudaMallocManaged(&b, size) );
checkCuda( cudaMallocManaged(&c, size) );

initWith(3, a, N);
initWith(4, b, N);
initWith(0, c, N);

size_t threadsPerBlock;
size_t numberOfBlocks;

threadsPerBlock = 256;
// Ceiling division so that numberOfBlocks * threadsPerBlock >= N.
numberOfBlocks = (N + threadsPerBlock - 1) / threadsPerBlock;

// The execution configuration must name the grid and block sizes; an empty
// `<<>>` is not valid syntax.
addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

// A launch does not return a status: launch-time errors are read with
// cudaGetLastError(), and errors raised while the kernel runs are reported
// by the synchronizing call.
checkCuda( cudaGetLastError() );
checkCuda( cudaDeviceSynchronize() );

// 3 + 4: every element of c is expected to be 7.
checkElementsAre(7, c, N);

checkCuda( cudaFree(a) );
checkCuda( cudaFree(b) );
checkCuda( cudaFree(c) );
// One thread per output element: the x dimension of the launch selects the
// row and the y dimension selects the column of c that this thread computes.
const int row = blockIdx.x * blockDim.x + threadIdx.x;
const int col = blockIdx.y * blockDim.y + threadIdx.y;
int dot = 0;

// Threads that fall outside the N x N matrix do nothing.
if (row < N && col < N)
{
  // Dot product of row `row` of a with column `col` of b (row-major storage).
  for (int k = 0; k < N; k++)
  {
    dot += a[row * N + k] * b[k * N + col];
  }
  c[row * N + col] = dot;
}
// Sequential reference product: for every (row, col) pair, accumulate the dot
// product of row `row` of a with column `col` of b and store it into c.
for (int row = 0; row < N; row++)
{
  for (int col = 0; col < N; col++)
  {
    int sum = 0;   // fresh accumulator for each output element
    for (int k = 0; k < N; k++)
    {
      sum += a[row * N + k] * b[k * N + col];
    }
    c[row * N + col] = sum;
  }
}
// Driver for the matrix-multiply example: allocate four N x N int matrices in
// managed memory, fill the inputs, compute the product on the GPU and on the
// CPU, compare the two results, and free the memory.
int *a = NULL, *b = NULL, *c_cpu = NULL, *c_gpu = NULL;

int size = N * N * sizeof (int); // Number of bytes of an N x N matrix

// Allocate memory. Each call runs only if the previous one succeeded, so
// `status` holds the first failure.
cudaError_t status = cudaMallocManaged (&a, size);
if (status == cudaSuccess) status = cudaMallocManaged (&b, size);
if (status == cudaSuccess) status = cudaMallocManaged (&c_cpu, size);
if (status == cudaSuccess) status = cudaMallocManaged (&c_gpu, size);
if (status != cudaSuccess)
{
  fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(status));
  // Pointers that were never allocated are still NULL, which cudaFree accepts.
  cudaFree(a); cudaFree(b);
  cudaFree( c_cpu ); cudaFree( c_gpu );
  return 1; // non-zero exit status from main
}

// Initialize memory
for( int row = 0; row < N; ++row )
  for( int col = 0; col < N; ++col )
  {
    a[row*N + col] = row;
    b[row*N + col] = col+2;
    c_cpu[row*N + col] = 0;
    c_gpu[row*N + col] = 0;
  }

dim3 threads_per_block (16, 16, 1); // A 16 x 16 block threads
// Ceiling division: just enough blocks to cover N in each dimension, without
// an extra all-idle block when N is a multiple of the block size.
dim3 number_of_blocks ((N + threads_per_block.x - 1) / threads_per_block.x,
                       (N + threads_per_block.y - 1) / threads_per_block.y,
                       1);

matrixMulGPU <<< number_of_blocks, threads_per_block >>> ( a, b, c_gpu );

// A launch does not return a status: read launch-time errors first, then
// wait for the GPU to finish, which reports errors raised during execution.
bool error = false;
status = cudaGetLastError();
if (status == cudaSuccess) status = cudaDeviceSynchronize();
if (status != cudaSuccess)
{
  fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(status));
  error = true;
}

if (!error)
{
  // Call the CPU version to check our work
  matrixMulCPU( a, b, c_cpu );

  // Compare the two answers to make sure they are equal
  for( int row = 0; row < N && !error; ++row )
    for( int col = 0; col < N && !error; ++col )
      if (c_cpu[row * N + col] != c_gpu[row * N + col])
      {
        printf("FOUND ERROR at c[%d][%d]\n", row, col);
        error = true;
        break;
      }
}
if (!error)
  printf("Success!\n");

// Free all our allocated memory
cudaFree(a); cudaFree(b);
cudaFree( c_cpu ); cudaFree( c_gpu );
2.思考创建多少thread、block、grid及它们的维数(size_t 或 dim3(x,y,z))
3.把要在gpu上运行的函数改写为核函数 __global__ void fun()(循环迭代由各线程并行承担,因此减少了for循环)
4.执行在gpu上运行的程序
fun<<<numberOfBlocks, threadsPerBlock>>>()
5.host与device的同步
6.释放gpu上的内存
例子://原在CPU上运行的向量相加的程序
#include <stdio.h>
// initWith: takes a value `num`, a float buffer `a`, and an element count `N`.
// NOTE(review): the body is empty as listed, so this function does nothing.
// The fill loop (`a[i] = num`) that presumably belongs here appears detached
// near the top of this post — likely an extraction artifact; confirm and restore.
void initWith(float num, float *a, int
N)
{
}
// addVectorsInto: takes an output buffer `result`, two input buffers `a` and
// `b`, and an element count `N`.
// NOTE(review): the body is empty as listed, so this function does nothing.
// The loop `result[i] = a[i] + b[i]` that presumably belongs here appears
// detached near the top of this post — confirm and restore.
void addVectorsInto(float *result, float *a,
float *b, int N)
{
}
// checkElementsAre: takes an expected value `target`, a float buffer `array`,
// and an element count `N`.
// NOTE(review): the body is empty as listed, so nothing is checked.
// The comparison loop that presumably belongs here appears detached near the
// top of this post — confirm and restore.
void checkElementsAre(float target, float
*array, int N)
{
}
// Entry point of the CPU-only example.
// NOTE(review): the body is empty as listed. The malloc / initWith /
// addVectorsInto / checkElementsAre / free sequence that presumably belongs
// here appears detached near the top of this post — confirm and restore.
int main()
{
}
-----------------------------------------------------------------------------------
//在GPU上运行的向量相加的程序
#include <stdio.h>
#include <assert.h>
// checkCuda: takes a CUDA runtime status and is declared to return one.
// NOTE(review): the body is empty as listed, so control reaches the end of a
// non-void function without returning a value. The check-and-return
// statements that presumably belong here appear detached earlier in this
// post — confirm and restore.
inline cudaError_t checkCuda(cudaError_t
result)
{
}
// initWith (host function of the GPU example): takes a value `num`, a float
// buffer `a`, and an element count `N`.
// NOTE(review): the body is empty as listed, so this function does nothing.
// The fill loop that presumably belongs here appears detached earlier in this
// post — confirm and restore.
void initWith(float num, float *a, int
N)
{
}
// addVectorsInto: kernel (declared __global__) taking an output buffer
// `result`, two input buffers `a` and `b`, and an element count `N`.
// NOTE(review): the body is empty as listed, so the kernel writes nothing.
// The index/stride loop that presumably belongs here appears detached earlier
// in this post — confirm and restore.
__global__
void addVectorsInto(float *result, float *a,
float *b, int N)
{
}
// checkElementsAre (host function of the GPU example): takes an expected
// value `target`, a float buffer `array`, and an element count `N`.
// NOTE(review): the body is empty as listed, so nothing is checked.
// The comparison loop that presumably belongs here appears detached earlier
// in this post — confirm and restore.
void checkElementsAre(float target, float
*array, int N)
{
}
// Entry point of the GPU vector example.
// NOTE(review): the body is empty as listed. The cudaMallocManaged / kernel
// launch / synchronize / cudaFree sequence that presumably belongs here
// appears detached earlier in this post — confirm and restore.
int main()
{
}
**************************************************************************************
//在GPU上运行的矩阵相乘的程序
#include <stdio.h>
#define N 64
// matrixMulGPU: kernel (declared __global__) taking three int buffers
// `a`, `b`, and `c`.
// NOTE(review): the body is empty as listed, so the kernel writes nothing.
// The row/col dot-product code that presumably belongs here appears detached
// earlier in this post — confirm and restore.
__global__ void matrixMulGPU( int * a, int *
b, int * c )
{
}
// matrixMulCPU: host function taking three int buffers `a`, `b`, and `c`.
// NOTE(review): the body is empty as listed, so this function does nothing.
// The triple loop that presumably belongs here appears detached earlier in
// this post — confirm and restore.
void matrixMulCPU( int * a, int
* b, int * c )
{
}
// Entry point of the matrix-multiply example.
// NOTE(review): the body is empty as listed. The allocation / initialization /
// launch / comparison / free sequence that presumably belongs here appears
// detached earlier in this post — confirm and restore.
int main()
{
}
前一篇:时间复杂度/空间复杂度
后一篇:docker介绍