#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

// Kernel that adds two integers passed by value and stores the sum on the device.
//
// Parameters:
//   d_a, d_b - the two operands, copied into the kernel as launch arguments.
//   d_c      - pointer the sum is written through; the kernel dereferences it,
//              so it must point to memory that device code can write.
//
// The kernel does no thread indexing: every launched thread stores the same
// value to *d_c, so a single-thread launch (<<<1, 1>>>) is sufficient.
__global__ void gpuAdd(int d_a, int d_b, int *d_c)
{
    *d_c = d_a + d_b;
}
// Host entry point: computes 1 + 4 on the GPU and prints the result.
//
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main()
{
    // Host variable that receives the answer copied back from the device
    int h_c = 0;
    // Device pointer the kernel writes its result through
    int *d_c = NULL;
    int status = 0;

    // Allocate room for one int on the device
    cudaError_t err = cudaMalloc((void**)&d_c, sizeof(int));

    if (err == cudaSuccess) {
        // <<<1, 1>>> means 1 block is executed with 1 thread per block;
        // 1 and 4 are passed by value, the sum is stored through d_c
        gpuAdd<<<1, 1>>>(1, 4, d_c);
        // A kernel launch returns nothing; launch errors are read back here
        err = cudaGetLastError();
    }

    if (err == cudaSuccess) {
        // Copy result from device memory to host memory
        err = cudaMemcpy(&h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        printf("1 + 4 = %d\n", h_c);
    } else {
        // Do not print h_c: it does not hold a valid result on this path
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        status = 1;
    }

    // Free up memory (d_c is still NULL if the allocation itself failed)
    cudaFree(d_c);
    return status;
}
在 main 函数中,前两行定义主机和设备的变量。第三行使用 cudaMalloc 函数在设备上分配 d_c 变量的内存。cudaMalloc 函数类似于 C 中的 malloc 函数。在 main 函数中调用 gpuAdd,其中 1 和 4 是两个输入变量,d_c 是一个作为输出指针变量的设备显存指针。如果 gpuAdd 的结果需要在主机上使用,那么它必须从设备的内存复制到主机的内存中,这是由 cudaMemcpy 函数完成的。然后,使用 printf 函数打印这个结果。倒数第二行使用 cudaFree 函数释放设备上使用的内存。从程序中释放设备上使用的所有内存是非常重要的,否则,你可能在某个时候耗尽内存。
双变量加法程序有两个函数:main 和gpuAdd。如你所见,gpuAdd 是通过使用__global__关键字定义的,因此它用于在设备上执行,而 main 函数将在主机上执行。这个程序将设备上的两个变量相加,并在命令行上打印输出。
#include <stdio.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>

// Kernel that adds two integers passed by reference and stores the sum.
//
// Parameters:
//   d_a, d_b - pointers to the two operands; the kernel reads through them.
//   d_c      - pointer the sum is written through.
//
// All three pointers are dereferenced in device code, so they must refer to
// memory the device can access. The kernel does no thread indexing: every
// launched thread reads the same operands and stores the same value to *d_c,
// so a single-thread launch (<<<1, 1>>>) is sufficient.
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c)
{
    *d_c = *d_a + *d_b;
}
// Host entry point: copies two operands to the device, adds them there with
// gpuAdd, copies the sum back and prints it.
//
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main()
{
    // Host variables: the two operands and the result
    int h_a = 1;
    int h_b = 4;
    int h_c = 0;
    // Device pointers, one per host variable
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;
    int status = 0;

    // Allocate memory for the device pointers; each step runs only if every
    // earlier step succeeded, so err always holds the first failure
    cudaError_t err = cudaMalloc((void**)&d_a, sizeof(int));
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&d_b, sizeof(int));
    }
    if (err == cudaSuccess) {
        err = cudaMalloc((void**)&d_c, sizeof(int));
    }

    // Copy the values of the host operands into device memory
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_a, &h_a, sizeof(int), cudaMemcpyHostToDevice);
    }
    if (err == cudaSuccess) {
        err = cudaMemcpy(d_b, &h_b, sizeof(int), cudaMemcpyHostToDevice);
    }

    if (err == cudaSuccess) {
        // One block with one thread; parameters are passed by reference
        gpuAdd<<<1, 1>>>(d_a, d_b, d_c);
        // A kernel launch returns nothing; launch errors are read back here
        err = cudaGetLastError();
    }

    // Copy the result from device memory to the host
    if (err == cudaSuccess) {
        err = cudaMemcpy(&h_c, d_c, sizeof(int), cudaMemcpyDeviceToHost);
    }

    if (err == cudaSuccess) {
        printf("Passing Parameter by Reference Output: %d + %d = %d\n", h_a, h_b, h_c);
    } else {
        // Do not print h_c: it does not hold a valid result on this path
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
        status = 1;
    }

    // Free up memory (pointers whose allocation never happened are still NULL)
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return status;
}