forked from jaredhoberock/cuda_launch_config
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsaxpy.cu
55 lines (40 loc) · 1.43 KB
/
saxpy.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#include "cuda_launch_config.hpp"
#include <thrust/device_vector.h>
#include <thrust/logical.h>
#include <thrust/functional.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
// Single-precision a*x + y, in place: y[i] = a * x[i] + y[i] for every i in [0, n).
//
// Launch layout: 1D grid of 1D blocks (only the .x components are read).
// Any grid size is valid: each thread starts at its flat global index and
// then advances by the total number of launched threads, so a grid with
// fewer than n threads still updates every element.
//
// a: scalar multiplier
// x: input array, read only; must hold at least n floats readable from device code
// y: input/output array; must hold at least n floats writable from device code
// n: number of elements to process
__global__ void saxpy(float a, float *x, float *y, size_t n)
{
  // widen to size_t before multiplying so neither the index nor the stride
  // can wrap in 32-bit arithmetic when n is very large
  const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

  for(size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
      i < n;
      i += stride)
  {
    y[i] = a * x[i] + y[i];
  }
}
// Returns true when `status` is cudaSuccess; otherwise prints a diagnostic
// naming the step `what` to stderr and returns false.
static bool cuda_call_succeeded(cudaError_t status, const char *what)
{
  if(status == cudaSuccess)
  {
    return true;
  }

  std::fprintf(stderr, "saxpy: %s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Example driver: picks a block size for saxpy with the occupancy heuristic
// from cuda_launch_config.hpp, launches the kernel over 2^20 elements and
// verifies that every output element equals 10 * 10 + 100 = 200.
//
// Returns 0 on success and EXIT_FAILURE if any CUDA call fails, the
// heuristic yields an unusable block size, or the result is wrong.
int main()
{
  size_t n = 1 << 20;

  thrust::device_vector<float> x(n, 10);
  thrust::device_vector<float> y(n, 100);

  float a = 10;

  // we'd like to launch saxpy, but we're not sure which block size to use
  // let's use a heuristic which promotes occupancy.

  // first, get the cudaFuncAttributes object corresponding to saxpy
  cudaFuncAttributes attributes;
  if(!cuda_call_succeeded(cudaFuncGetAttributes(&attributes, saxpy), "cudaFuncGetAttributes"))
  {
    return EXIT_FAILURE;
  }

  // next, get the cudaDeviceProp object corresponding to the current device
  int device;
  if(!cuda_call_succeeded(cudaGetDevice(&device), "cudaGetDevice"))
  {
    return EXIT_FAILURE;
  }

  cudaDeviceProp properties;
  if(!cuda_call_succeeded(cudaGetDeviceProperties(&properties, device), "cudaGetDeviceProperties"))
  {
    return EXIT_FAILURE;
  }

  // we can combine the two to compute a block size
  size_t num_threads = block_size_with_maximum_potential_occupancy(attributes, properties);

  // NOTE(review): whether the heuristic can actually return 0 is not visible
  // from this file; guard anyway so the division below can never trap
  if(num_threads == 0)
  {
    std::fprintf(stderr, "saxpy: occupancy heuristic returned a block size of 0\n");
    return EXIT_FAILURE;
  }

  // compute the number of blocks of size num_threads to launch
  size_t num_blocks = n / num_threads;

  // check for partial block at the end
  if(n % num_threads) ++num_blocks;

  saxpy<<<num_blocks,num_threads>>>(a, thrust::raw_pointer_cast(x.data()), thrust::raw_pointer_cast(y.data()), n);

  // a kernel launch returns nothing: an invalid configuration is only
  // reported through cudaGetLastError()
  if(!cuda_call_succeeded(cudaGetLastError(), "saxpy launch"))
  {
    return EXIT_FAILURE;
  }

  // faults raised while the kernel runs only surface at a synchronizing call
  if(!cuda_call_succeeded(cudaDeviceSynchronize(), "saxpy execution"))
  {
    return EXIT_FAILURE;
  }

  // validate the result
  // done outside assert() so the check still runs when NDEBUG is defined
  const bool all_correct = thrust::all_of(y.begin(), y.end(), thrust::placeholders::_1 == 200);
  if(!all_correct)
  {
    std::fprintf(stderr, "saxpy: validation failed, not every element of y equals 200\n");
    return EXIT_FAILURE;
  }

  return 0;
}