-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathothello_kernel.cu
108 lines (81 loc) · 2.55 KB
/
othello_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#include <stdio.h>
#define N 10
#define BLOCKSIZE 10
void minmaxCuda(double *max, double *min, double *a, float &time);
/*
 * Per-block minimum/maximum reduction.
 *
 * Every thread loads one double from `a` at the flat index
 * blockIdx.x * blockDim.x + threadIdx.x into two shared-memory tiles. The
 * block then reduces both tiles in shared memory, and thread 0 stores the
 * block's maximum in max[blockIdx.x] and its minimum in min[blockIdx.x].
 *
 * Preconditions (not checked inside the kernel):
 *   - blockDim.x <= BLOCKSIZE, since each shared tile holds BLOCKSIZE entries;
 *   - `a` holds at least gridDim.x * blockDim.x elements (the load is unguarded);
 *   - `max` and `min` each hold at least gridDim.x elements.
 *
 * blockDim.x does not have to be a power of two: pairs whose partner slot
 * would fall outside the block are skipped.
 */
__global__ void minmaxKernel(double *max, double *min, double *a) {
    __shared__ double maxtile[BLOCKSIZE];
    __shared__ double mintile[BLOCKSIZE];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Seed both tiles with this thread's element.
    double value = a[i];
    maxtile[tid] = value;
    mintile[tid] = value;
    __syncthreads();

    // Pairwise reduction with a strided index: at step s, thread tid owns slot
    // 2*s*tid and folds slot 2*s*tid + s into it. Each slot is touched by at
    // most one thread per step, so there is no race within a step.
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        unsigned int index = 2 * s * tid;
        // Guard on the partner slot so it never reads past the block's
        // elements when blockDim.x is not a power of two.
        if (index + s < blockDim.x) {
            if (maxtile[index + s] > maxtile[index])
                maxtile[index] = maxtile[index + s];
            if (mintile[index + s] < mintile[index])
                mintile[index] = mintile[index + s];
        }
        // Reached by every thread of the block: kept outside the branch.
        __syncthreads();
    }

    // Slot 0 now holds the block-wide result.
    if (tid == 0) {
        max[blockIdx.x] = maxtile[0];
        min[blockIdx.x] = mintile[0];
    }
}
/*
 * Second-stage minimum/maximum reduction over per-block partial results.
 *
 * Every thread loads max[i] and min[i], with
 * i = blockIdx.x * blockDim.x + threadIdx.x, into shared-memory tiles. The
 * block reduces the tiles, and thread 0 writes the reduced maximum back to
 * max[blockIdx.x] and the reduced minimum back to min[blockIdx.x]. The
 * arrays are therefore updated in place.
 *
 * Preconditions (not checked inside the kernel):
 *   - blockDim.x <= BLOCKSIZE, since each shared tile holds BLOCKSIZE entries;
 *   - `max` and `min` each hold at least gridDim.x * blockDim.x elements
 *     (the loads are unguarded).
 *
 * blockDim.x does not have to be a power of two: pairs whose partner slot
 * would fall outside the block are skipped.
 */
__global__ void finalminmaxKernel(double *max, double *min) {
    __shared__ double maxtile[BLOCKSIZE];
    __shared__ double mintile[BLOCKSIZE];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Stage the partial results in shared memory before reducing them.
    maxtile[tid] = max[i];
    mintile[tid] = min[i];
    __syncthreads();

    // Pairwise reduction with a strided index: at step s, thread tid owns slot
    // 2*s*tid and folds slot 2*s*tid + s into it. Each slot is touched by at
    // most one thread per step, so there is no race within a step.
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        unsigned int index = 2 * s * tid;
        // Guard on the partner slot so it never reads past the block's
        // elements when blockDim.x is not a power of two.
        if (index + s < blockDim.x) {
            if (maxtile[index + s] > maxtile[index])
                maxtile[index] = maxtile[index + s];
            if (mintile[index + s] < mintile[index])
                mintile[index] = mintile[index + s];
        }
        // Reached by every thread of the block: kept outside the branch.
        __syncthreads();
    }

    // Slot 0 now holds the block-wide result.
    if (tid == 0) {
        max[blockIdx.x] = maxtile[0];
        min[blockIdx.x] = mintile[0];
    }
}
/* Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess. */
static bool minmaxCudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "minmaxCuda: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Host driver: computes the minimum and maximum of an N*N array of doubles on
 * the GPU and reports the elapsed kernel time.
 *
 * Parameters:
 *   max  - host buffer of at least N doubles. On success max[0] is the overall
 *          maximum; entries 1..N-1 hold per-block maxima from the first pass.
 *   min  - host buffer of at least N doubles, laid out like `max` for minima.
 *   a    - host input array of N*N doubles.
 *   time - receives the milliseconds measured between the two CUDA events that
 *          bracket the kernel launches; 0 if any CUDA call failed before the
 *          measurement could be taken.
 *
 * The first kernel runs N blocks of BLOCKSIZE threads over `a`; the second
 * runs one block of BLOCKSIZE threads over the N partial results. Both
 * launches therefore rely on BLOCKSIZE == N as defined in this file.
 *
 * Any failing CUDA call is reported on stderr and the remaining steps are
 * skipped; device buffers and events are always released before returning.
 */
void minmaxCuda(double *max, double *min, double *a, float &time)
{
    double *dev_a = 0;
    double *dev_max = 0;
    double *dev_min = 0;
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float milliseconds = 0;

    const size_t partialBytes = N * sizeof(double);
    const size_t inputBytes = N * N * sizeof(double);

    dim3 dimBlock(BLOCKSIZE);
    dim3 dimGrid(N);

    // Set-up: events, device buffers, and the host-to-device copy of the input.
    bool ok = minmaxCudaOk(cudaEventCreate(&start), "cudaEventCreate(start)") &&
              minmaxCudaOk(cudaEventCreate(&stop), "cudaEventCreate(stop)") &&
              minmaxCudaOk(cudaMalloc((void**)&dev_max, partialBytes), "cudaMalloc(dev_max)") &&
              minmaxCudaOk(cudaMalloc((void**)&dev_min, partialBytes), "cudaMalloc(dev_min)") &&
              minmaxCudaOk(cudaMalloc((void**)&dev_a, inputBytes), "cudaMalloc(dev_a)") &&
              minmaxCudaOk(cudaMemcpy(dev_a, a, inputBytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy(host -> dev_a)") &&
              minmaxCudaOk(cudaEventRecord(start), "cudaEventRecord(start)");

    if (ok) {
        // Pass 1: one min/max pair per block.
        minmaxKernel<<<dimGrid, dimBlock>>>(dev_max, dev_min, dev_a);
        // Launches return no status; configuration errors surface here.
        ok = minmaxCudaOk(cudaGetLastError(), "minmaxKernel launch");
    }
    if (ok) {
        // Pass 2: a single block reduces the per-block partials in place.
        finalminmaxKernel<<<1, dimBlock>>>(dev_max, dev_min);
        ok = minmaxCudaOk(cudaGetLastError(), "finalminmaxKernel launch");
    }

    ok = ok &&
         minmaxCudaOk(cudaEventRecord(stop), "cudaEventRecord(stop)") &&
         minmaxCudaOk(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)") &&
         // Surfaces any asynchronous fault raised while the kernels executed.
         minmaxCudaOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
         minmaxCudaOk(cudaMemcpy(max, dev_max, partialBytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(dev_max -> host)") &&
         minmaxCudaOk(cudaMemcpy(min, dev_min, partialBytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(dev_min -> host)") &&
         minmaxCudaOk(cudaEventElapsedTime(&milliseconds, start, stop),
                      "cudaEventElapsedTime");

    time = milliseconds;

    // Release everything that was acquired, whether or not the run succeeded.
    cudaFree(dev_max);
    cudaFree(dev_min);
    cudaFree(dev_a);
    if (start)
        cudaEventDestroy(start);
    if (stop)
        cudaEventDestroy(stop);
}