#include #define RADIUS 3 #define BLOCK_SIZE 256 #define NUM_ELEMENTS (4096*2) // CUDA API error checking macro #define cudaCheck(error) \ if (error != cudaSuccess) { \ printf("Fatal error: %s at %s:%d\n", \ cudaGetErrorString(error), \ __FILE__, __LINE__); \ exit(1); \ } __global__ void stencil_1d(int *in, int *out) { __shared__ int temp[BLOCK_SIZE + 2 * RADIUS]; int gindex = threadIdx.x + (blockIdx.x * blockDim.x) + RADIUS; int lindex = threadIdx.x + RADIUS; // Read input elements into shared memory temp[lindex] = in[gindex]; if (threadIdx.x < RADIUS) { temp[lindex - RADIUS] = in[gindex - RADIUS]; temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE]; } // Make sure all threads get to this point before proceeding! __syncthreads(); // Apply the stencil int result = 0; for (int offset = -RADIUS ; offset <= RADIUS ; offset++) result += temp[lindex + offset]; // Store the result out[gindex-RADIUS] = result; } int main() { unsigned int i; int h_in[NUM_ELEMENTS + 2 * RADIUS], h_out[NUM_ELEMENTS]; int *d_in, *d_out; // Initialize host data for( i = 0; i < (NUM_ELEMENTS + 2*RADIUS); ++i ) h_in[i] = 1; // With a value of 1 and RADIUS of 3, all output values should be 7 // Allocate space on the device cudaCheck( cudaMalloc( &d_in, (NUM_ELEMENTS + 2*RADIUS) * sizeof(int)) ); cudaCheck( cudaMalloc( &d_out, NUM_ELEMENTS * sizeof(int)) ); // Copy input data to device cudaCheck( cudaMemcpy( d_in, h_in, (NUM_ELEMENTS + 2*RADIUS) * sizeof(int), cudaMemcpyHostToDevice) ); stencil_1d<<< (NUM_ELEMENTS + BLOCK_SIZE - 1)/BLOCK_SIZE, BLOCK_SIZE >>> (d_in, d_out); cudaCheck( cudaMemcpy( h_out, d_out, NUM_ELEMENTS * sizeof(int), cudaMemcpyDeviceToHost) ); // Verify every out value is 7 for( i = 0; i < NUM_ELEMENTS; ++i ) if (h_out[i] != 7) { printf("Element h_out[%d] == %d != 7\n", i, h_out[i]); break; } if (i == NUM_ELEMENTS) printf("SUCCESS!\n"); // Free out memory cudaFree(d_in); cudaFree(d_out); return 0; }