Finally, I managed to call custom CUDA kernels from Nim. Gist.
Well, basically I lost the fight to call the kernels directly from Nim and ended up writing a small C wrapper, but oh well.
The code:
square.cu
#include "square.cuh"
__global__ void square(float * d_out, float * d_in){
int idx = threadIdx.x;
float f = d_in[idx];
d_out[idx] = f * f;
}
void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
square<<<bpg,tpb>>>(d_out, d_in);
}
square.cuh
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
void cuda_square(int bpg, int tpb, float * d_out, float * d_in);
call_cuda.nim
import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
import sequtils, future

type GpuArray[T: SomeReal] = object
  data: ref[ptr T]
  len: int

{.compile: "./square.cu".}
proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc, header: "../square.cuh".}
  # ../square.cuh is a workaround because the header is not copied to nimcache
  ## Compute the square of x and store it in y
  ## bpg: BlocksPerGrid
  ## tpb: ThreadsPerBlock

proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  let s = size * sizeof(T)
  check cudaMalloc(cast[ptr pointer](addr result), s)

proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  if not p[].isNil:
    check cudaFree(p[])

proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.} =
  new(result.data, deallocCuda)
  result.len = len
  result.data[] = cudaMalloc[T](result.len)

proc cuda[T: SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.} =
  result = newGpuArray[T](s.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)

proc cpu[T: SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.} =
  result = newSeq[T](g.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)

proc main() =
  let a = newSeq[float32](64)
  let b = toSeq(0..63).map(x => x.float32)
  echo a
  echo b

  var u = a.cuda
  let v = b.cuda

  cuda_square(1.cint, 64.cint, u.data[], v.data[])
  check cudaDeviceSynchronize()

  let z = u.cpu
  echo z

main()

## Output:
# @[0.0, 0.0, 0.0, 0.0, 0.0, ...]
# @[0.0, 1.0, 2.0, 3.0, 4.0, ...]
# @[0.0, 1.0, 4.0, 9.0, 16.0, ...]
Thanks andrea, jcosborn and Araq in particular for tooling and inspiration.
Actually there is even simpler code that avoids having to copy the header to ./nimcache (gist).
Note: VS Code and GitHub properly highlight the emit block.
import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
import sequtils, future

type GpuArray[T: SomeReal] = object
  data: ref[ptr T]
  len: int

{.emit: """
__global__ void square(float * d_out, float * d_in){
  int idx = threadIdx.x;
  float f = d_in[idx];
  d_out[idx] = f * f;
}

void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
  square<<<bpg,tpb>>>(d_out, d_in);
}
""".}

proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc.}
  ## Compute the square of x and store it in y
  ## bpg: BlocksPerGrid
  ## tpb: ThreadsPerBlock

proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  let s = size * sizeof(T)
  check cudaMalloc(cast[ptr pointer](addr result), s)

proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  if not p[].isNil:
    check cudaFree(p[])

proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.} =
  new(result.data, deallocCuda)
  result.len = len
  result.data[] = cudaMalloc[T](result.len)

proc cuda[T: SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.} =
  result = newGpuArray[T](s.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)

proc cpu[T: SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.} =
  result = newSeq[T](g.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)

proc main() =
  let a = newSeq[float32](64)
  let b = toSeq(0..63).map(x => x.float32)
  echo a
  echo b

  var u = a.cuda
  let v = b.cuda

  cuda_square(1.cint, 64.cint, u.data[], v.data[])
  check cudaDeviceSynchronize()

  let z = u.cpu
  echo z

main()

## Output:
# @[0.0, 0.0, 0.0, 0.0, 0.0, ...]
# @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, ...]
# @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, ...]
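The example gets away with a single block of 64 threads. For larger arrays you would need several blocks, so the kernel would have to index with blockIdx.x * blockDim.x + threadIdx.x and bounds-check against the array length; the launch parameters themselves are just a ceiling division. A minimal sketch (the launchConfig helper is hypothetical, not part of the gist):

proc launchConfig(n: int, threadsPerBlock = 256): tuple[bpg, tpb: cint] =
  ## Hypothetical helper: pick a grid size that covers n elements
  ## with a fixed number of threads per block.
  let blocksPerGrid = (n + threadsPerBlock - 1) div threadsPerBlock  # ceiling division
  result = (blocksPerGrid.cint, threadsPerBlock.cint)

# Usage sketch:
#   let (bpg, tpb) = launchConfig(u.len)
#   cuda_square(bpg, tpb, u.data[], v.data[])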
I have to get a GPU memory address with cudaMalloc, which gives me a ptr T.
However, I don't want to manage that memory manually, so I wrap it in a ref that the Nim GC will manage. I just pass it a finalizer proc (deallocCuda, which calls the official cudaFree) to make sure that when there is no more reference to that ptr T, it is deallocated.
So I end up with something like a garbage-collected GPU memory object, which is really neat.
@mratsim Oh yes, I tend to forget Nim is garbage-collected, as I don't use data structures that couldn't use RAII all that much. ^^" But is it really OK not to clean up the GPU memory object? It seems to me it should always be deallocated, but I'm not really sure. I ask because finalizers, contrary to destructors, are not required to actually be called:
type Sth = ref object
proc echoSth(x: Sth) = echo "Sth!"
var a: Sth
new(a, echoSth)
# echos nothing
As far as I know, you need to explicitly request that finalizers run if you really want them to be called:
type Sth = ref object
proc echoSth(x: Sth) = echo "Sth!"
var a: Sth
new(a, echoSth)
deallocHeap()
# echos: "Sth!"
@mratsim
Well, lucky you. ;) Try adding an echo or something like that so you know whether the finalizer was actually called.
I read about someone on the Internet who said he didn't explicitly deallocate his GPU memory and it worked anyway... but then he described how it subtly changed his program's behaviour, so I think you should be careful about assuming the finalizers were called for sure. Also, it seems to me that a more serious problem is that the GC ignores the amount of free memory on the GPU when deciding whether to deallocate your GPU object or not; it only considers CPU memory. That seems like the bigger problem, I guess.
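A possible workaround (just an untested sketch, reusing the raw cudaMalloc binding and check from nimcuda as in the code above, and assuming cudaSuccess from driver_types) would be to force a collection and retry when a device allocation fails, since it is the finalizers that actually give device memory back:

proc cudaMallocRetry[T](size: int): ptr T =
  # Sketch only: the Nim GC tracks host memory, not device memory, so when a
  # device allocation fails, run a full collection (which fires pending
  # finalizers and therefore cudaFree) and try once more.
  let s = size * sizeof(T)
  if cudaMalloc(cast[ptr pointer](addr result), s) != cudaSuccess:
    GC_fullCollect()
    check cudaMalloc(cast[ptr pointer](addr result), s)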
I double-checked, and the CUDA driver releases all GPU memory allocated by a program upon termination. I also use addQuitProc to make sure I release all my handles.
I guess I'll also add a GC_fullCollect to the exit procedure.
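Something like this, for the record (just a sketch using the stdlib addQuitProc):

# Run a full GC pass at program exit so remaining finalizers (and thus
# cudaFree) get a chance to run before the process terminates.
addQuitProc(proc () {.noconv.} = GC_fullCollect())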