"""Benchmark several PyCUDA invocation styles against a plain CPU loop.

Each timing function runs N repetitions of an element-wise product of two
float32 vectors of length SIZE and returns the mean wall-clock seconds per
iteration:

* ``cpu``      -- pure-Python per-element loop (baseline).
* ``vers_0``   -- explicit ``mem_alloc`` + ``memcpy`` each iteration,
                  plain (unprepared) kernel call.
* ``vers_0_1`` -- same transfers, but a *prepared* invocation to cut
                  per-call overhead.
* ``vers_1``   -- ``drv.In``/``drv.Out`` convenience wrappers (transfers
                  handled implicitly per call).
* ``vers_2``   -- ``gpuarray`` arithmetic (``(a_gpu * b_gpu).get()``).
"""

import timeit

import numpy
import pycuda.autoinit  # noqa: F401 -- side effect: creates the CUDA context
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

# One thread per element, indexed globally so the whole vector is covered
# no matter how many blocks are launched.  (A threadIdx.x-only index with a
# (1,1) grid would touch just the first blockDim.x elements of the vector.)
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

SIZE = 10000

a = numpy.random.randn(SIZE).astype(numpy.float32)
b = numpy.random.randn(SIZE).astype(numpy.float32)
dest = numpy.zeros_like(a)

block = (256, 1, 1)
# Enough blocks to cover all SIZE elements (ceiling division).
grid = ((SIZE + block[0] - 1) // block[0], 1)
# Kernel expects a 32-bit int for the element count.
n_elems = numpy.int32(SIZE)


def cpu(N, a, b, dest):
    """Time N full-vector element-wise products done in pure Python.

    The inner loop covers all SIZE elements so each timed iteration does
    the same amount of work as one GPU iteration below.
    """
    start_time = timeit.default_timer()
    for _ in range(N):
        for i in range(SIZE):
            dest[i] = a[i] * b[i]
    return (timeit.default_timer() - start_time) / N


def vers_0(N, func, a, b, dest):
    """Explicit device allocation and per-iteration host<->device copies."""
    # Memory allocation on the GPU side (hoisted out of the timed loop).
    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)
    dest_gpu = drv.mem_alloc(dest.nbytes)
    start_time = timeit.default_timer()
    for _ in range(N):
        # Host -> device transfer.
        drv.memcpy_htod(a_gpu, a)
        drv.memcpy_htod(b_gpu, b)
        func(dest_gpu, a_gpu, b_gpu, n_elems, block=block, grid=grid)
        # Device -> host transfer (synchronizes, so the timing is honest).
        drv.memcpy_dtoh(dest, dest_gpu)
    return (timeit.default_timer() - start_time) / N


def vers_0_1(N, func, a, b, dest):
    """Like vers_0, but using prepared invocations to reduce call overhead."""
    a_gpu = drv.mem_alloc(a.nbytes)
    b_gpu = drv.mem_alloc(b.nbytes)
    dest_gpu = drv.mem_alloc(dest.nbytes)
    # Three pointers plus one 32-bit int, matching the kernel signature.
    func.prepare("PPPi")
    start_time = timeit.default_timer()
    for _ in range(N):
        drv.memcpy_htod(a_gpu, a)
        drv.memcpy_htod(b_gpu, b)
        func.prepared_call(grid, block, dest_gpu, a_gpu, b_gpu, n_elems)
        drv.memcpy_dtoh(dest, dest_gpu)
    return (timeit.default_timer() - start_time) / N


def vers_1(N, func, a, b, dest):
    """drv.In/drv.Out wrappers: transfers are performed implicitly per call."""
    start_time = timeit.default_timer()
    for _ in range(N):
        func(drv.Out(dest), drv.In(a), drv.In(b), n_elems,
             block=block, grid=grid)
    return (timeit.default_timer() - start_time) / N


def vers_2(N, a, b, dest):
    """gpuarray arithmetic; .get() copies the product back each iteration."""
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    start_time = timeit.default_timer()
    for _ in range(N):
        dest = (a_gpu * b_gpu).get()
    return (timeit.default_timer() - start_time) / N


# Number of timed iterations per variant.
N = 500

print("Vector size: %s" % SIZE)

print("Timing CPU")
elapsed = cpu(N, a, b, dest)
print("CPU elapsed: %3.3e" % elapsed)

print("Timing vers_0")
elapsed = vers_0(N, multiply_them, a, b, dest)
print("vers_0 elapsed: %3.3e" % elapsed)

print("Timing vers_0.1")
elapsed = vers_0_1(N, multiply_them, a, b, dest)
print("vers_0_1 elapsed: %3.3e" % elapsed)

print("Timing vers_1")
elapsed = vers_1(N, multiply_them, a, b, dest)
print("vers_1 elapsed: %3.3e" % elapsed)

print("Timing vers_2")
elapsed = vers_2(N, a, b, dest)
print("vers_2 elapsed: %3.3e" % elapsed)