#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>

/*
 * Abort with a readable message if a CUDA runtime call fails.
 * Every runtime API call returns cudaError_t; ignoring it lets an early
 * failure surface later as an unrelated-looking error.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise vector addition: c[i] = a[i] + b[i].
 *
 * Launch layout: a 1D grid with ONE block per element and one thread per
 * block, so blockIdx.x is the element index. The kernel takes no length
 * argument and therefore cannot bounds-check: the caller must launch
 * exactly as many blocks as a, b and c have elements.
 *
 * a, b : device pointers to the input vectors
 * c    : device pointer to the output vector
 */
__global__ void add(int *a, int *b, int *c)
{
    int i = blockIdx.x;
    c[i] = a[i] + b[i];
}

/* experiment with different values of N. */
/* how large can it be? (the x-dimension of the grid is bounded by the  */
/* device limit reported in cudaDeviceProp::maxGridSize[0])             */
#define N 32

int main()
{
    int *a, *b, *c;        /* host copies of a, b, c   */
    int *d_a, *d_b, *d_c;  /* device copies of a, b, c */
    size_t size = N * sizeof( int );

    /* allocate space for device copies of a, b, c */
    CUDA_CHECK( cudaMalloc( (void **) &d_a, size ) );
    CUDA_CHECK( cudaMalloc( (void **) &d_b, size ) );
    CUDA_CHECK( cudaMalloc( (void **) &d_c, size ) );

    /* allocate space for host copies of a, b, c and setup input values */
    a = (int *)malloc( size );
    b = (int *)malloc( size );
    c = (int *)malloc( size );
    if( a == NULL || b == NULL || c == NULL )
    {
        fprintf( stderr, "host allocation of %zu bytes failed\n", size );
        return EXIT_FAILURE;
    }

    /* initializing a, b, c on host */
    for( int i = 0; i < N; i++ )
    {
        a[i] = b[i] = i;
        c[i] = 0;
    }

    /* copy inputs to device */
    CUDA_CHECK( cudaMemcpy( d_a, a, size, cudaMemcpyHostToDevice ) );
    CUDA_CHECK( cudaMemcpy( d_b, b, size, cudaMemcpyHostToDevice ) );

    /* launch the kernel on the GPU with N blocks and 1 thread per block */
    add<<< N, 1 >>>( d_a, d_b, d_c );
    /* a launch returns no status itself; bad configurations show up here */
    CUDA_CHECK( cudaGetLastError() );

    /* copy result back to host (blocking copy, so it also waits for the */
    /* kernel and reports any error raised while it was executing)       */
    CUDA_CHECK( cudaMemcpy( c, d_c, size, cudaMemcpyDeviceToHost ) );

    for( int i = 0; i < N; i++ )
    {
        printf("c[%d] = %d\n",i,c[i]);
    } /* end for */

    /* clean up */
    free(a);
    free(b);
    free(c);
    CUDA_CHECK( cudaFree( d_a ) );
    CUDA_CHECK( cudaFree( d_b ) );
    CUDA_CHECK( cudaFree( d_c ) );

    return 0;
} /* end main */