I'm playing with nimcuda and having trouble launching a kernel with cudaLaunchKernel.
Here is the full example (all credit to mratsim for the original in this post: https://forum.nim-lang.org/t/3171):
import nimcuda/[cuda_runtime_api, driver_types, nimcuda, vector_types]
import sequtils, sugar

type GpuArray[T] = object
  data: ref[ptr T]
  len: int

{.emit: """
__global__ void cuda_square(float * d_out, float * d_in){
  int idx = threadIdx.x;
  float f = d_in[idx];
  d_out[idx] = f * f;
}
""".}

proc cuda_square(y: ptr cfloat, x: ptr cfloat) {.importc, nodecl.}
  ## Compute the square of x and store it in y
  ## bpg: BlocksPerGrid
  ## tpb: ThreadsPerBlock

proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  let s = size * sizeof(T)
  check cudaMalloc(cast[ptr pointer](addr result), s)

proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  if not p[].isNil:
    check cudaFree(p[])

proc newGpuArray[T](len: int): GpuArray[T] {.noSideEffect.} =
  new(result.data, deallocCuda)
  result.len = len
  result.data[] = cudaMalloc[T](result.len)

proc cuda[T](s: seq[T]): GpuArray[T] {.noSideEffect.} =
  result = newGpuArray[T](s.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)

proc cpu[T](g: GpuArray[T]): seq[T] {.noSideEffect.} =
  result = newSeq[T](g.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)

proc main() =
  let a = newSeq[float32](64)
  let b = toSeq(0..63).map(x => x.float32)
  echo a
  echo b

  var u = a.cuda
  let v = b.cuda

  let args = [u, v]

  check cudaLaunchKernel(cuda_square,
                         dim3(x: 1, y: 1, z: 1),
                         dim3(x: 64, y: 1, z: 1),
                         cast[ptr pointer](unsafeAddr args),
                         0, nil)
  check cudaDeviceSynchronize()

  let z = u.cpu
  echo z

main()
## Output:
# @[0.0, 0.0, 0.0, 0.0, 0.0, ...]
# @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, ...]
# @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, ...]
I am able to build this with nvcc (being careful about the flags), but my output is:
@[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
@[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0]
nimcuda.nim(50) main
Error: unhandled exception: 98 (invalid data!) 98 [CudaError]
The generated C code for the args looks fine to me:
args[0] = (*u.data);
args[1] = (*v.data);
yX60gensym20_ = Dl_1006633115_(((void*) (cuda_square)), TM__86QDd8eYh9bOeq3xkQRr9aKg_12, TM__86QDd8eYh9bOeq3xkQRr9aKg_13, ((void**) (args)), ((NI) 0), NIM_NIL);
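For comparison, this is how I read the cudaLaunchKernel documentation: args should be an array of pointers, one per kernel parameter, each pointing at that parameter's value, i.e. the addresses of the device pointers rather than the device pointers themselves. A rough, self-contained CUDA C sketch of that usage, with d_out/d_in standing in for the device pointers held in u.data[] and v.data[] (launch_reference is just a name for the sketch):

__global__ void cuda_square(float* d_out, float* d_in);  // the kernel from the emit block above

void launch_reference() {
  float* d_out;
  float* d_in;
  cudaMalloc(&d_out, 64 * sizeof(float));
  cudaMalloc(&d_in,  64 * sizeof(float));

  // addresses of the kernel arguments, not the argument values themselves
  void* args[] = { &d_out, &d_in };

  cudaLaunchKernel((const void*)cuda_square,
                   dim3(1, 1, 1),   // grid
                   dim3(64, 1, 1),  // block
                   args,
                   0,               // dynamic shared memory
                   nullptr);        // default stream
}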
Is there a problem with the way I have set up passing args, or is there some other issue?
Sadly I can't really help you, since my GPU is too recent for nimcuda (which only supports up to CUDA 8.x, I don't remember the exact version; I don't have the vector_types.h header), and I don't know how to install CUDA 8.x without messing up the whole system.
Can you clarify how you build with nvcc? Do you compile with nim cpp example.nim and then use nvcc on the generated C++ files?
What generation of graphics card do you have? Which version of CUDA do you use? Have you tried cuda-gdb?
Since the post is quite old, I guess you may have also asked on the Discord/IRC/Matrix/... chat. Did you get any progress on it since then?
I guess only two or three people in the Nim community can answer Nvidia GPU related questions :(
It was about 5 years ago that I played with cudaLaunchKernel, but I couldn't make it work.
This worked, though: https://github.com/jcosborn/cudanim/blob/338be782104af887521f7d6a6c09ea19ed0b86c3/cuda.nim#L104-L120
The rest of the codebase went over my head at the time; it's basically a compiler that generates Nim code so stripped down (of nodes like nnkHiddenDeref, for example) that it is also valid CUDA code.
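If you just want to sidestep cudaLaunchKernel entirely, there is also the classic wrapper trick (this is not what cudanim does, just a rough CUDA C sketch of the idea; launch_cuda_square is a made-up name): emit an extern "C" launcher around the normal <<<>>> syntax and importc that from Nim.

__global__ void cuda_square(float* d_out, float* d_in) {
  int idx = threadIdx.x;
  float f = d_in[idx];
  d_out[idx] = f * f;
}

// C-callable launcher: the Nim side only ever sees an ordinary function
// taking raw device pointers, and nvcc handles the <<<>>> launch syntax.
extern "C" void launch_cuda_square(int bpg, int tpb, float* d_out, float* d_in) {
  cuda_square<<<bpg, tpb>>>(d_out, d_in);
}

On the Nim side such a launcher can be declared with importc (two cints plus two ptr cfloat) and called like any other proc.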