CUDA example
Installing CUDA on a Linux machine also installs the Nsight Eclipse Edition IDE (and JDK 8 was already set up earlier).
First launch Nsight from the command line and create a new CUDA project:
dassein@pad:~$ nsight
Java HotSpot(TM) 64-Bit Server VM warning: ignoring option MaxPermSize=256m; support was removed in 8.0
CompilerOracle: exclude java/lang/reflect/Array.newInstance
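If the IDE fails to start, a quick sanity check (assuming a standard toolkit installation that put nvcc and nsight on the PATH) is:

dassein@pad:~$ which nvcc nsight
dassein@pad:~$ nvcc --version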
Paste the following example code into the project:
/*
 * Vector addition: c[i] = a[i] + b[i]
 */
// CUDA header
#include <cuda_runtime.h>
// C headers
#include <stdio.h>
#include <time.h>
// Macros: N is the array length, thread_num is the number of threads per block
// (example values; adjust to the sizes you want to test)
#define N 10
#define thread_num 10
// GPU kernel declaration
__global__ void add(int *a, int *b, int *c);
// CPU function declaration
void add_CPU(int *a, int *b, int *c);

int main()
{
    // Timing via CUDA events
    float time_CPU, time_GPU;
    cudaEvent_t start_GPU, stop_GPU, start_CPU, stop_CPU;
    // Timing via the host clock
    float time_cpu, time_gpu;
    clock_t start_cpu, stop_cpu, start_gpu, stop_gpu;
    int a[N], b[N], c[N], c_CPU[N];
    int *dev_a, *dev_b, *dev_c;
    // Knowing the number of threads per block, work out how many blocks to launch
    int block_num;
    // The block count must be rounded up; either of the two lines below works
    //block_num = (N % thread_num == 0) ? (N / thread_num) : (N / thread_num + 1);
    block_num = (N + thread_num - 1) / thread_num;
    // Allocate memory on the GPU
    cudaMalloc((void**)&dev_a, N*sizeof(int));
    cudaMalloc((void**)&dev_b, N*sizeof(int));
    cudaMalloc((void**)&dev_c, N*sizeof(int));
    // Initialize a and b on the CPU
    for (int i = 0; i < N; i++)
    {
        a[i] = -i;
        b[i] = i*i;
    }
    // Create the events
    cudaEventCreate(&start_CPU);
    cudaEventCreate(&stop_CPU);
    // Record the current time
    cudaEventRecord(start_CPU, 0);
    start_cpu = clock();
    // CPU computation /*******************************************************/
    add_CPU(a, b, c_CPU);
    stop_cpu = clock();
    // Record the current time
    cudaEventRecord(stop_CPU, 0);
    cudaEventSynchronize(start_CPU);                        // wait for the event to complete
    cudaEventSynchronize(stop_CPU);                         // wait until all work recorded before the event has finished
    cudaEventElapsedTime(&time_CPU, start_CPU, stop_CPU);   // elapsed time in ms
    printf("The time for CPU:\t%f(ms)\n", time_CPU);
    cudaEventDestroy(start_CPU);                            // destroy the events
    cudaEventDestroy(stop_CPU);
    // Print the CPU result
    printf("\nResult from CPU:\n");
    for (int i = 0; i < N; i++)
    {
        printf("CPU:\t%d+%d=%d\n", a[i], b[i], c_CPU[i]);
    }
    // GPU computation /*******************************************************/
    // Copy the inputs from the CPU to the GPU
    cudaMemcpy(dev_a, a, N*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N*sizeof(int), cudaMemcpyHostToDevice);
    // Create the events
    cudaEventCreate(&start_GPU);
    cudaEventCreate(&stop_GPU);
    // Record the current time
    cudaEventRecord(start_GPU, 0);
    start_gpu = clock();
    // Launch the CUDA kernel
    //printf("block_num=%d\tthread_num=%d\n", block_num, thread_num);
    add<<<block_num, thread_num>>>(dev_a, dev_b, dev_c);
    stop_gpu = clock();   // the launch is asynchronous, so the host clock mostly captures launch overhead
    // Record the current time
    cudaEventRecord(stop_GPU, 0);
    cudaEventSynchronize(start_GPU);                        // wait for the event to complete
    cudaEventSynchronize(stop_GPU);                         // wait until all work recorded before the event has finished
    cudaEventElapsedTime(&time_GPU, start_GPU, stop_GPU);   // elapsed time in ms
    printf("\nThe time for GPU:\t%f(ms)\n", time_GPU);
    // Copy the result back from the GPU
    cudaMemcpy(c, dev_c, N*sizeof(int), cudaMemcpyDeviceToHost);
    // Print the GPU result
    printf("\nResult from GPU:\n");
    for (int i = 0; i < N; i++)
    {
        printf("GPU:\t%d+%d=%d\n", a[i], b[i], c[i]);
    }
    cudaEventDestroy(start_GPU);                            // destroy the events
    cudaEventDestroy(stop_GPU);
    // Free GPU memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    printf("\nThe time for CPU by event:\t%f(ms)\n", time_CPU);
    printf("The time for GPU by event:\t%f(ms)\n", time_GPU);
    // clock() differences divided by CLOCKS_PER_SEC are seconds; multiply by 1000 to report ms
    time_cpu = (float)(stop_cpu - start_cpu) / CLOCKS_PER_SEC * 1000;
    time_gpu = (float)(stop_gpu - start_gpu) / CLOCKS_PER_SEC * 1000;
    printf("\nThe time for CPU by host:\t%f(ms)\n", time_cpu);
    printf("The time for GPU by host:\t%f(ms)\n", time_gpu);
    return 0;
}
// GPU kernel
__global__ void add(int *a, int *b, int *c)
{
    int tid = blockIdx.x*blockDim.x + threadIdx.x;   // global index of the element handled by this thread
    if (tid < N)
    {
        c[tid] = a[tid] + b[tid];
    }
}
// CPU function
void add_CPU(int *a, int *b, int *c)
{
    for (int i = 0; i < N; i++)
    {
        c[i] = a[i] + b[i];
    }
}
You can then build and debug the project just as you would in Eclipse.
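Alternatively, the same source can be compiled and run straight from the terminal with nvcc; a minimal sketch, assuming the listing above is saved as vector_add.cu (the file name is arbitrary):

dassein@pad:~$ nvcc -o vector_add vector_add.cu
dassein@pad:~$ ./vector_add

A Debug build in Nsight generates an equivalent nvcc invocation, typically with -g -G added so the device code can be stepped through in the debugger.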