#include "../common/book.h"

#define imin(a,b) (a<b?a:b)

const int N = 33 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid =
    imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
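//// (Added note) Ceiling division: (N + threadsPerBlock - 1) / threadsPerBlock
//// rounds up, giving (33792 + 255) / 256 = 132 blocks for one thread per
//// element; imin caps the grid at 32 blocks, and the grid-stride loop in the
//// kernel picks up the elements the "missing" blocks would have handled.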

__global__ void dot(float *a, float *b, float *c) {
    //// Shared-memory cache (resident on the physical GPU); the compiler
    //// creates one copy of this array per thread block, and every thread in
    //// the same block shares it.
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;  //// global index: thread index plus block offset
    //// Each block has its own copy of the shared cache, so the cache index is
    //// simply the thread's index within its block.
    int cacheIndex = threadIdx.x;

    float temp = 0;
    while (tid < N) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;  //// grid-stride: advance by the total number of threads
    }
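    //// (Added note) With the launch configuration above there are
    //// 32 * 256 = 8192 threads in total, so each thread strides through the
    //// 33 * 1024 = 33792 elements in steps of 8192 and accumulates 4 or 5
    //// products into temp.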
    // set the cache values
    //// Each thread stores its accumulated partial sum in its own slot of the
    //// shared cache.
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    //// Barrier: execution continues only after every thread in the block has
    //// finished the store above.
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    //// Reduction: add up all the values stored in the shared cache. Because
    //// each iteration halves the number of summands, threadsPerBlock must be
    //// a power of 2.
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        //// Synchronize again: all threads must finish this round's additions
        //// before the next reduction iteration starts.
        __syncthreads();
        i /= 2;
    }

    //// Thread 0 writes the block's final sum back to global memory.
    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}
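//// (Added note) A worked trace of the reduction for a hypothetical 8-thread
//// block, showing why the halving scheme needs a power of 2:
////   cache:     [c0 c1 c2 c3 c4 c5 c6 c7]
////   i = 4  ->  [c0+c4, c1+c5, c2+c6, c3+c7, ...]
////   i = 2  ->  [c0+c4+c2+c6, c1+c5+c3+c7, ...]
////   i = 1  ->  [sum of all eight values, ...]
//// After log2(8) = 3 rounds, cache[0] holds the block's total.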

int main(void) {
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // allocate memory on the cpu side
    a = (float*)malloc(N * sizeof(float));
    b = (float*)malloc(N * sizeof(float));
    //// This buffer holds blocksPerGrid floats, because each thread block on
    //// the GPU hands back exactly one partial sum.
    partial_c = (float*)malloc(blocksPerGrid * sizeof(float));
    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,
                            N * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,
                            N * sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
                            blocksPerGrid * sizeof(float)));
    // fill in the host memory with data
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }
    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(float),
                            cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(float),
                            cudaMemcpyHostToDevice));

    dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b,
                                            dev_partial_c);
    // copy the partial sums back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
                            blocksPerGrid * sizeof(float),
                            cudaMemcpyDeviceToHost));
    // finish up on the CPU side
    //// Sum the per-block partial results on the CPU to get the final dot product.
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
    printf("Does GPU value %.6g = %.6g?\n", c,
           2 * sum_squares((float)(N - 1)));
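    //// (Added note) Why this closed form checks the answer: a[i] = i and
    //// b[i] = 2i, so the dot product is the sum of 2*i*i for i = 0..N-1,
    //// i.e. 2 * (N-1)N(2N-1)/6, which is exactly 2 * sum_squares(N - 1).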
    // free memory on the gpu side
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the cpu side
    free(a);
    free(b);
    free(partial_c);
}
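
//// (Added note, a minimal sketch not in the original listing) Kernel launches
//// return no status directly, so a failed launch only surfaces later. Both
//// cudaGetLastError() and cudaDeviceSynchronize() are standard CUDA runtime
//// calls that return cudaError_t, so they fit the book's HANDLE_ERROR macro;
//// inserting them right after the launch makes errors visible at the point
//// they occur:
////
////     dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_partial_c);
////     HANDLE_ERROR(cudaGetLastError());       // bad launch configuration
////     HANDLE_ERROR(cudaDeviceSynchronize());  // errors raised during execution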