import pycuda.driver as drv
import pycuda.autoinit
import numpy
from pycuda.compiler import SourceModule

# Element-wise product kernel: dest[i] = a[i] * b[i].
# Each thread handles the element at its own threadIdx.x and there is no
# bounds check, so the launch below must use a single 1-D block with
# exactly one thread per array element.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

# Single source of truth for the problem size. The array lengths and the
# launch configuration are both derived from it so they cannot drift apart.
NUM_ELEMENTS = 400

# Because the kernel only indexes by threadIdx.x, the whole problem has to
# fit in one block; fail loudly instead of launching an invalid configuration.
max_threads_per_block = pycuda.autoinit.device.get_attribute(
    drv.device_attribute.MAX_THREADS_PER_BLOCK)
if NUM_ELEMENTS > max_threads_per_block:
    raise ValueError(
        "multiply_them runs in a single block: %d elements exceed the "
        "device limit of %d threads per block"
        % (NUM_ELEMENTS, max_threads_per_block))

# Host-side inputs (float32 to match the kernel's `float *` parameters)
# and a zero-filled host buffer that will receive the result.
a = numpy.random.randn(NUM_ELEMENTS).astype(numpy.float32)
b = numpy.random.randn(NUM_ELEMENTS).astype(numpy.float32)
dest = numpy.zeros_like(a)

# Allocate device buffers with the same byte sizes as the host arrays.
a_gpu = drv.mem_alloc(a.nbytes)
b_gpu = drv.mem_alloc(b.nbytes)
dest_gpu = drv.mem_alloc(dest.nbytes)

# Copy the two input arrays from host to device.
drv.memcpy_htod(a_gpu, a)
drv.memcpy_htod(b_gpu, b)

# Launch one block with one thread per element.
multiply_them(
        dest_gpu, a_gpu, b_gpu,
        block=(a.size, 1, 1))

# Copy the result from device back into the host buffer.
drv.memcpy_dtoh(dest, dest_gpu)

# Difference between the GPU result and the NumPy reference product.
print(dest - a * b)