1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <math.h>
#include <helper_cuda.h>
using namespace std;
#define N 100
/*
 * Element-wise vector addition on the device: c[i] = a[i] + b[i] for every
 * index i in [0, N), where N is the file-level element count.
 *
 * Launch layout: any 1D grid / 1D block configuration. Each thread starts at
 * its flat global index (blockIdx.x * blockDim.x + threadIdx.x) and advances
 * by the total number of launched threads, so all N elements are processed
 * regardless of how many blocks or threads per block are used. With a
 * <<<N, 1>>> launch every thread handles exactly the element whose index
 * equals its block index.
 *
 * No shared memory is used.
 *
 * a, b : input arrays readable from device code, holding at least N doubles.
 * c    : output array writable from device code, holding at least N doubles.
 */
__global__ void add_kernel(double *a, double *b, double *c) {
    /* Widen to size_t before multiplying so large grids cannot overflow. */
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < (size_t)N;
         i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
/*
 * Fills the two input vectors on the device: for every index i in [0, N),
 * a[i] = i and b[i] = i * i (both computed in double precision). N is the
 * file-level element count.
 *
 * Launch layout: any 1D grid / 1D block configuration. Each thread starts at
 * its flat global index (blockIdx.x * blockDim.x + threadIdx.x) and advances
 * by the total number of launched threads, so all N elements are written
 * regardless of how many blocks or threads per block are used. With a
 * <<<N, 1>>> launch every thread writes exactly the element whose index
 * equals its block index.
 *
 * No shared memory is used.
 *
 * a, b : output arrays writable from device code, holding at least N doubles.
 */
__global__ void value_init_kernel(double *a, double *b) {
    /* Widen to size_t before multiplying so large grids cannot overflow. */
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < (size_t)N;
         i += stride)
    {
        /* Convert once to double; the square is then a double multiply. */
        const double v = (double)i;
        a[i] = v;
        b[i] = v * v;
    }
}
/*
 * Host driver for the vector-add demo.
 *
 * Steps:
 *   1. Allocate three device buffers of N doubles each.
 *   2. Launch value_init_kernel to fill dev_a / dev_b on the GPU, then
 *      add_kernel to compute dev_c = dev_a + dev_b.
 *   3. Copy all three buffers back to host arrays and print one
 *      "a + b = c" line per element.
 *   4. Release the device buffers.
 *
 * Every CUDA runtime call and both kernel launches are checked; on any
 * failure a message is written to stderr, the device buffers are released,
 * and the process exits with EXIT_FAILURE. Returns 0 on success.
 */
int main(void)
{
    const size_t bytes = N * sizeof(double);
    cudaError_t err1 = cudaSuccess, err2 = cudaSuccess, err3 = cudaSuccess;
    double a[N], b[N], c[N];
    /* NULL-initialised so the failure paths can always call cudaFree safely. */
    double *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    err1 = cudaMalloc((void**)&dev_a, bytes);
    err2 = cudaMalloc((void**)&dev_b, bytes);
    err3 = cudaMalloc((void**)&dev_c, bytes);
    if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
    {
        fprintf(stderr, "Failed to allocate device value (error code (%s,%s,%s))!\n", cudaGetErrorString(err1), cudaGetErrorString(err2), cudaGetErrorString(err3));
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
        exit(EXIT_FAILURE);
    }

    /* Initialise the input vectors on the GPU. */
    value_init_kernel <<<N, 1 >>> (dev_a, dev_b);
    /* Kernel launches return nothing; launch errors must be fetched explicitly. */
    err1 = cudaGetLastError();
    /* Add the two vectors on the GPU. */
    add_kernel <<<N, 1 >>> (dev_a, dev_b, dev_c);
    err2 = cudaGetLastError();
    /* Wait for both kernels so execution faults are reported here rather
       than being attributed to the copies below. */
    err3 = cudaDeviceSynchronize();
    if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
    {
        fprintf(stderr, "Failed to run kernels (error code (%s,%s,%s))!\n", cudaGetErrorString(err1), cudaGetErrorString(err2), cudaGetErrorString(err3));
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
        exit(EXIT_FAILURE);
    }

    err1 = cudaMemcpy(a, dev_a, bytes, cudaMemcpyDeviceToHost);
    err2 = cudaMemcpy(b, dev_b, bytes, cudaMemcpyDeviceToHost);
    err3 = cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost);
    if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
    {
        fprintf(stderr, "Failed to copy device value to host value (error code (%s,%s,%s))!\n", cudaGetErrorString(err1), cudaGetErrorString(err2), cudaGetErrorString(err3));
        cudaFree(dev_a);
        cudaFree(dev_b);
        cudaFree(dev_c);
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < N; i++)
    {
        printf("%f + %f = %f\n", a[i], b[i], c[i]);
    }

    /* Release GPU memory. */
    err1 = cudaFree(dev_a);
    err2 = cudaFree(dev_b);
    err3 = cudaFree(dev_c);
    if (err1 != cudaSuccess || err2 != cudaSuccess || err3 != cudaSuccess)
    {
        fprintf(stderr, "Failed to free device value (error code (%s,%s,%s))!\n", cudaGetErrorString(err1), cudaGetErrorString(err2), cudaGetErrorString(err3));
        exit(EXIT_FAILURE);
    }
    return 0;
}
|