"""Benchmark several PyCUDA invocation styles against a plain CPU loop.

Each timing function runs N repetitions of an element-wise product of two
float32 vectors of length SIZE and returns the mean wall-clock seconds per
iteration:

* ``cpu``      -- pure-Python per-element loop (baseline).
* ``vers_0``   -- explicit ``mem_alloc`` + ``memcpy`` each iteration,
                  plain (unprepared) kernel call.
* ``vers_0_1`` -- same transfers, but a *prepared* invocation to cut
                  per-call overhead.
* ``vers_1``   -- ``drv.In``/``drv.Out`` convenience wrappers (transfers
                  handled implicitly per call).
* ``vers_2``   -- ``gpuarray`` arithmetic (``(a_gpu * b_gpu).get()``).
"""

import timeit

import numpy
import pycuda.autoinit  # noqa: F401 -- side effect: creates the CUDA context
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

# One thread per element, indexed globally so the whole vector is covered
# no matter how many blocks are launched.  (A threadIdx.x-only index with a
# (1,1) grid would touch just the first blockDim.x elements of the vector.)
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

SIZE = 10000

a = numpy.random.randn(SIZE).astype(numpy.float32)
b = numpy.random.randn(SIZE).astype(numpy.float32)
dest = numpy.zeros_like(a)

block = (256, 1, 1)
# Enough blocks to cover all SIZE elements (ceiling division).
grid = ((SIZE + block[0] - 1) // block[0], 1)
# Kernel expects a 32-bit int for the element count.
n_elems = numpy.int32(SIZE)


def cpu(N, a, b, dest):
    """Time N full-vector element-wise products done in pure Python.

    The inner loop covers all SIZE elements so each timed iteration does
    the same amount of work as one GPU iteration below.
    """
    start_time = timeit.default_timer()
    for _ in range(N):
        for i in range(SIZE):
            dest[i] = a[i] * b[i]
    return (timeit.default_timer() - start_time) / N


def vers_0(N, func, a, b, dest):
    """Explicit device allocation and per-iteration host<->device copies."""
    # Memory allocation on the GPU side (hoisted out of the timed loop).
    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)
    dest_gpu = drv.mem_alloc(dest.nbytes)
    start_time = timeit.default_timer()
    for _ in range(N):
        # Host -> device transfer.
        drv.memcpy_htod(a_gpu, a)
        drv.memcpy_htod(b_gpu, b)
        func(dest_gpu, a_gpu, b_gpu, n_elems, block=block, grid=grid)
        # Device -> host transfer (synchronizes, so the timing is honest).
        drv.memcpy_dtoh(dest, dest_gpu)
    return (timeit.default_timer() - start_time) / N


def vers_0_1(N, func, a, b, dest):
    """Like vers_0, but using prepared invocations to reduce call overhead."""
    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)
    dest_gpu = drv.mem_alloc(dest.nbytes)
    # Three pointers plus one 32-bit int, matching the kernel signature.
    func.prepare("PPPi")
    start_time = timeit.default_timer()
    for _ in range(N):
        drv.memcpy_htod(a_gpu, a)
        drv.memcpy_htod(b_gpu, b)
        func.prepared_call(grid, block, dest_gpu, a_gpu, b_gpu, n_elems)
        drv.memcpy_dtoh(dest, dest_gpu)
    return (timeit.default_timer() - start_time) / N


def vers_1(N, func, a, b, dest):
    """drv.In/drv.Out wrappers: transfers are performed implicitly per call."""
    start_time = timeit.default_timer()
    for _ in range(N):
        func(drv.Out(dest), drv.In(a), drv.In(b), n_elems,
             block=block, grid=grid)
    return (timeit.default_timer() - start_time) / N


def vers_2(N, a, b, dest):
    """gpuarray arithmetic; .get() copies the product back each iteration."""
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    start_time = timeit.default_timer()
    for _ in range(N):
        dest = (a_gpu * b_gpu).get()
    return (timeit.default_timer() - start_time) / N


# Number of timed iterations per variant.
N = 500

print("Vector size: %s" % SIZE)

print("Timing CPU")
elapsed = cpu(N, a, b, dest)
print("CPU elapsed: %3.3e" % elapsed)

print("Timing vers_0")
elapsed = vers_0(N, multiply_them, a, b, dest)
print("vers_0 elapsed: %3.3e" % elapsed)

print("Timing vers_0.1")
elapsed = vers_0_1(N, multiply_them, a, b, dest)
print("vers_0_1 elapsed: %3.3e" % elapsed)

print("Timing vers_1")
elapsed = vers_1(N, multiply_them, a, b, dest)
print("vers_1 elapsed: %3.3e" % elapsed)

print("Timing vers_2")
elapsed = vers_2(N, a, b, dest)
print("vers_2 elapsed: %3.3e" % elapsed)