"""Multiply two float32 vectors element-wise on the GPU using PyCUDA.

The script allocates device buffers explicitly, copies the inputs to the
device, launches the ``multiply_them`` kernel, copies the result back and
prints the difference to the same product computed on the host with NumPy.
"""
import numpy
# Imported for its side effect only (CUDA initialisation / context setup,
# per the PyCUDA documentation); the name itself is never referenced.
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# The kernel computes a global element index from the block and thread
# indices and guards it against ``n``, so it stays correct when the launch
# uses several blocks or more threads than there are elements.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)

# Launch configuration derived from the data instead of a hard-coded 400:
# enough blocks of ``threads_per_block`` threads to cover every element.
n = a.size
threads_per_block = 256
blocks = (n + threads_per_block - 1) // threads_per_block  # ceil(n / tpb)

# Memory allocation on the GPU side.
a_gpu = drv.mem_alloc(a.nbytes)
b_gpu = drv.mem_alloc(b.nbytes)
dest_gpu = drv.mem_alloc(dest.nbytes)

# Data transfer host -> device. Both inputs must be uploaded before the
# launch, otherwise the kernel would read uninitialised device memory.
drv.memcpy_htod(a_gpu, a)
drv.memcpy_htod(b_gpu, b)

# Scalar kernel arguments have to be passed as sized NumPy types so that
# PyCUDA knows how many bytes to push for the ``int n`` parameter.
multiply_them(
    dest_gpu, a_gpu, b_gpu, numpy.int32(n),
    block=(threads_per_block, 1, 1), grid=(blocks, 1))

# Memory copy from GPU back to CPU.
drv.memcpy_dtoh(dest, dest_gpu)

# Difference between the device result and the host reference product.
print(dest-a*b)