CUDA example
Installing CUDA on a Linux machine also installs the Nsight Eclipse Edition IDE (and JDK 8 was already set up earlier).
First launch Nsight from the command line and create a new CUDA project:
dassein@pad:~$ nsight
Java HotSpot(TM) 64-Bit Server VM warning: ignoring option MaxPermSize=256m; support was removed in 8.0
CompilerOracle: exclude java/lang/reflect/Array.newInstance
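If the IDE fails to start, a quick sanity check (assuming a standard toolkit installation that put nvcc and nsight on the PATH) is:

dassein@pad:~$ which nvcc nsight
dassein@pad:~$ nvcc --version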
Paste the following example code into the project:
/*
 * Vector addition: c[i] = a[i] + b[i]
 */
// CUDA header
#include <cuda_runtime.h>
// C headers
#include <stdio.h>
#include <time.h>
// Macros: N is the array length, thread_num is the number of threads per block
// (example values; adjust to the sizes you want to test)
#define N 10
#define thread_num 10
// GPU kernel declaration
__global__ void add(int *a, int *b, int *c);
// CPU function declaration
void add_CPU(int *a, int *b, int *c);

int main()
{
    // Timing via CUDA events
    float time_CPU, time_GPU;
    cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
    // Timing via the host clock
    float time_cpu, time_gpu;
    clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    int a[N], b[N], c[N], c_CPU[N];
    int *dev_a, *dev_b, *dev_c;
    // Knowing the number of threads per block, work out how many blocks to launch
    int block_num;
    // The block count must be rounded up; either of the two lines below works
    //block_num = (N % thread_num == 0) ? (N / thread_num) : (N / thread_num + 1);
    block_num = (N + thread_num - 1) / thread_num;
    // Allocate memory on the GPU
    cudaMalloc((void**)&dev_a, N*sizeof(int));
    cudaMalloc((void**)&dev_b, N*sizeof(int));
    cudaMalloc((void**)&dev_c, N*sizeof(int));
    // Initialize a and b on the CPU
    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i*i;
    }
    // Create the events
    cudaEventCreate(&start_CPU);
    cudaEventCreate(&stop_CPU);
    // Record the current time
    cudaEventRecord(start_CPU, 0);
    start_cpu = clock();
    // CPU computation /*******************************************************/
    add_CPU(a, b, c_CPU);
    stop_cpu = clock();
    // Record the current time
    cudaEventRecord(stop_CPU, 0);
    cudaEventSynchronize(start_CPU);                        // wait for the event to complete
    cudaEventSynchronize(stop_CPU);                         // wait until all work recorded before the event has finished
    cudaEventElapsedTime(&time_CPU, start_CPU, stop_CPU);   // elapsed time in ms
    printf("The time for CPU:\t%f(ms)\n", time_CPU);
    cudaEventDestroy(start_CPU);                            // destroy the events
    cudaEventDestroy(stop_CPU);
    // Print the CPU result
    printf("\nResult from CPU:\n");
    for (int i = 0; i < N; i++)
    {
        printf("CPU:\t%d+%d=%d\n", a[i], b[i], c_CPU[i]);
    }
    // GPU computation /*******************************************************/
    // Copy the inputs from the CPU to the GPU
    cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
    // Create the events
    cudaEventCreate(&start_GPU);
    cudaEventCreate(&stop_GPU);
    // Record the current time
    cudaEventRecord(start_GPU, 0);
    start_gpu = clock();
    // Launch the CUDA kernel
    //printf("block_num=%d\tthread_num=%d\n", block_num, thread_num);
    add<<<block_num, thread_num>>>(dev_a, dev_b, dev_c);
    stop_gpu = clock();   // the launch is asynchronous, so the host clock mostly captures launch overhead
    // Record the current time
    cudaEventRecord(stop_GPU, 0);
    cudaEventSynchronize(start_GPU);                        // wait for the event to complete
    cudaEventSynchronize(stop_GPU);                         // wait until all work recorded before the event has finished
    cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU);   // elapsed time in ms
    printf("\nThe time for GPU:\t%f(ms)\n", time_GPU);
    // Copy the result back from the GPU
    cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
    // Print the GPU result
    printf("\nResult from GPU:\n");
    for (int i = 0; i < N; i++)
    {
        printf("GPU:\t%d+%d=%d\n", a[i], b[i], c[i]);
    }
    cudaEventDestroy(start_GPU);                            // destroy the events
    cudaEventDestroy(stop_GPU);
    // Free GPU memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    printf("\nThe time for CPU by event:\t%f(ms)\n", time_CPU);
    printf("The time for GPU by event:\t%f(ms)\n", time_GPU);
    // clock() differences divided by CLOCKS_PER_SEC are seconds; multiply by 1000 to report ms
    time_cpu = (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC * 1000;
    time_gpu = (float)(stop_gpu - start_gpu) / CLOCKS_PER_SEC * 1000;
    printf("\nThe time for CPU by host:\t%f(ms)\n", time_cpu);
    printf("The time for GPU by host:\t%f(ms)\n", time_gpu);
    return 0;
}
// GPU kernel
__global__ void add(int *a, int *b, int *c)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;   // global index of the element handled by this thread
    if (tid < N)
    {
        c[tid] = a[tid] + b[tid];
    }
}
// CPU function
void add_CPU(int *a, int *b, int *c)
{
    for (int i = 0; i < N; i++)
    {
        c[i] = a[i] + b[i];
    }
}
You can then build and debug the project just as you would in Eclipse.
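Alternatively, the same source can be compiled and run straight from the terminal with nvcc; a minimal sketch, assuming the listing above is saved as vector_add.cu (the file name is arbitrary):

dassein@pad:~$ nvcc -o vector_add vector_add.cu
dassein@pad:~$ ./vector_add

A Debug build in Nsight generates an equivalent nvcc invocation, typically with -g -G added so the device code can be stepped through in the debugger.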