#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>

/* experiment with N */
/* how large can it be? */
#define N (2048*2048)
#define THREADS_PER_BLOCK 512

/*
 * Evaluate a CUDA runtime call and abort with file/line and the runtime's
 * error string if it did not return cudaSuccess.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise integer vector add: c[i] = a[i] + b[i] for 0 <= i < N.
 *
 * a, b : device pointers to the N input elements.
 * c    : device pointer to the N output elements.
 *
 * Launch with a 1D grid of 1D blocks containing at least N threads in
 * total; each thread handles one element. Threads whose global index
 * falls past N do nothing, so N need not be a multiple of the block size.
 */
__global__ void add(int *a, int *b, int *c)
{
    /* global index = offset of this block + position inside the block */
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    /* guard the grid tail when the grid is larger than N */
    if (index < N) {
        c[index] = a[index] + b[index];
    }
}

/*
 * Host driver: fills two N-element vectors with a[i] = b[i] = i, adds them
 * on the GPU, and prints the first and last element of the result.
 * Returns 0 on success; exits with EXIT_FAILURE on any allocation or CUDA
 * runtime error.
 */
int main()
{
    int *a = NULL, *b = NULL, *c = NULL;
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;

    /* size_t so the byte count does not overflow int when N is enlarged */
    size_t size = (size_t)N * sizeof( int );

    /* allocate space for device copies of a, b, c */
    CUDA_CHECK( cudaMalloc( (void **) &d_a, size ) );
    CUDA_CHECK( cudaMalloc( (void **) &d_b, size ) );
    CUDA_CHECK( cudaMalloc( (void **) &d_c, size ) );

    /* allocate space for host copies of a, b, c and setup input values */
    a = (int *)malloc( size );
    b = (int *)malloc( size );
    c = (int *)malloc( size );
    if( a == NULL || b == NULL || c == NULL )
    {
        fprintf( stderr, "host allocation of %zu bytes failed\n", size );
        free(a);
        free(b);
        free(c);
        cudaFree( d_a );
        cudaFree( d_b );
        cudaFree( d_c );
        return EXIT_FAILURE;
    }

    for( int i = 0; i < N; i++ )
    {
        a[i] = b[i] = i;
        c[i] = 0;
    }

    /* copy inputs to device */
    CUDA_CHECK( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) );

    /* launch the kernel on the GPU: one thread per element, with the
       block count rounded up so every element is covered */
    int blocks = ( N + THREADS_PER_BLOCK - 1 ) / THREADS_PER_BLOCK;
    add<<< blocks, THREADS_PER_BLOCK >>>( d_a, d_b, d_c );

    /* a kernel launch returns nothing; query the launch status explicitly,
       then wait so execution errors are reported here */
    CUDA_CHECK( cudaGetLastError() );
    CUDA_CHECK( cudaDeviceSynchronize() );

    /* copy result back to host */
    CUDA_CHECK( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) );

    printf( "c[0] = %d\n", c[0] );
    printf( "c[%d] = %d\n", N-1, c[N-1] );

    /* clean up */
    free(a);
    free(b);
    free(c);
    CUDA_CHECK( cudaFree( d_a ) );
    CUDA_CHECK( cudaFree( d_b ) );
    CUDA_CHECK( cudaFree( d_c ) );

    return 0;
} /* end main */