Finally, I managed to call custom CUDA kernels from Nim. Gist.
Well, basically I lost the fight to call the kernels directly from Nim and ended up writing a small C wrapper, but oh well.
The code:
square.cu
#include "square.cuh"
__global__ void square(float * d_out, float * d_in){
int idx = threadIdx.x;
float f = d_in[idx];
d_out[idx] = f * f;
}
void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
square<<<bpg,tpb>>>(d_out, d_in);
}
square.cuh
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
void cuda_square(int bpg, int tpb, float * d_out, float * d_in);
call_cuda.nim
import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
import sequtils, future

type GpuArray[T: SomeReal] = object
  data: ref[ptr T]
  len: int

{.compile: "./square.cu".}
proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc, header: "../square.cuh".}
  # ../square.cuh is a workaround because the header is not copied to nimcache
  ## Compute the square of x and store it in y
  ## bpg: BlocksPerGrid
  ## tpb: ThreadsPerBlock

proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  let s = size * sizeof(T)
  check cudaMalloc(cast[ptr pointer](addr result), s)

proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  if not p[].isNil:
    check cudaFree(p[])

proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.} =
  new(result.data, deallocCuda)
  result.len = len
  result.data[] = cudaMalloc[T](result.len)

proc cuda[T: SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.} =
  result = newGpuArray[T](s.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)

proc cpu[T: SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.} =
  result = newSeq[T](g.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)

proc main() =
  let a = newSeq[float32](64)
  let b = toSeq(0..63).map(x => x.float32)
  echo a
  echo b

  var u = a.cuda
  let v = b.cuda

  cuda_square(1.cint, 64.cint, u.data[], v.data[])
  check cudaDeviceSynchronize()

  let z = u.cpu
  echo z

main()

## Output:
# @[0.0, 0.0, 0.0, 0.0, 0.0, ...]
# @[0.0, 1.0, 2.0, 3.0, 4.0, ...]
# @[0.0, 1.0, 4.0, 9.0, 16.0, ...]
Thanks andrea, jcosborn and Araq in particular for tooling and inspiration.
Actually there is even simpler code that avoids having to copy the header to ./nimcache (gist).
Note: VS Code and GitHub properly highlight the emit block.
import nimcuda/[cuda_runtime_api, driver_types, nimcuda]
import sequtils, future

type GpuArray[T: SomeReal] = object
  data: ref[ptr T]
  len: int

{.emit: """
__global__ void square(float * d_out, float * d_in){
  int idx = threadIdx.x;
  float f = d_in[idx];
  d_out[idx] = f * f;
}

void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
  square<<<bpg,tpb>>>(d_out, d_in);
}
""".}

proc cuda_square(bpg, tpb: cint, y: ptr cfloat, x: ptr cfloat) {.importc.}
  ## Compute the square of x and store it in y
  ## bpg: BlocksPerGrid
  ## tpb: ThreadsPerBlock

proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  let s = size * sizeof(T)
  check cudaMalloc(cast[ptr pointer](addr result), s)

proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  if not p[].isNil:
    check cudaFree(p[])

proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.} =
  new(result.data, deallocCuda)
  result.len = len
  result.data[] = cudaMalloc[T](result.len)

proc cuda[T: SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.} =
  result = newGpuArray[T](s.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)

proc cpu[T: SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.} =
  result = newSeq[T](g.len)
  let size = result.len * sizeof(T)
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)

proc main() =
  let a = newSeq[float32](64)
  let b = toSeq(0..63).map(x => x.float32)
  echo a
  echo b

  var u = a.cuda
  let v = b.cuda

  cuda_square(1.cint, 64.cint, u.data[], v.data[])
  check cudaDeviceSynchronize()

  let z = u.cpu
  echo z

main()

## Output:
# @[0.0, 0.0, 0.0, 0.0, 0.0, ...]
# @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, ...]
# @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, ...]
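The example gets away with a single block of 64 threads. For larger arrays you would need several blocks, so the kernel would have to index with blockIdx.x * blockDim.x + threadIdx.x and bounds-check against the array length; the launch parameters themselves are just a ceiling division. A minimal sketch (the launchConfig helper is hypothetical, not part of the gist):

proc launchConfig(n: int, threadsPerBlock = 256): tuple[bpg, tpb: cint] =
  ## Hypothetical helper: pick a grid size that covers n elements
  ## with a fixed number of threads per block.
  let blocksPerGrid = (n + threadsPerBlock - 1) div threadsPerBlock  # ceiling division
  result = (blocksPerGrid.cint, threadsPerBlock.cint)

# Usage sketch:
#   let (bpg, tpb) = launchConfig(u.len)
#   cuda_square(bpg, tpb, u.data[], v.data[])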
I have to get a GPU memory address with cudaMalloc, which gives me a ptr T.
However, I don't want to manage that memory manually, so I wrap it in a ref that the Nim GC will manage. I just pass it a finalizer proc (deallocCuda, which calls the official cudaFree) to make sure that when there is no more reference to that ptr T, it is deallocated.
So I end up with something like a garbage-collected GPU memory object, which is really neat.
@mratsim Oh yes, I tend to forget Nim is garbage-collected, as I don't use data structures that couldn't use RAII all that much. ^^" But is it really OK not to clean up the GPU memory object? It seems to me it should always be deallocated, but I'm not really sure. I ask because finalizers, contrary to destructors, are not required to actually be called:
type Sth = ref object
proc echoSth(x: Sth) = echo "Sth!"
var a: Sth
new(a, echoSth)
# echos nothing
As far as I know, you need to explicitly request that finalizers run if you really want them to be called:
type Sth = ref object
proc echoSth(x: Sth) = echo "Sth!"
var a: Sth
new(a, echoSth)
deallocHeap()
# echos: "Sth!"
@mratsim
Well, lucky you. ;) Try adding an echo or something like that so you know whether the finalizer was actually called.
I read about someone on the Internet who said he didn't explicitly deallocate his GPU memory and it worked anyway... but then he described how it subtly changed his program's behaviour, so I think you should be careful about assuming the finalizers were called for sure. Also, it seems to me that a more serious problem is that the GC ignores the amount of free memory on the GPU when deciding whether to deallocate your GPU object or not; it only considers CPU memory. That seems like the bigger problem, I guess.
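A possible workaround (just an untested sketch, reusing the raw cudaMalloc binding and check from nimcuda as in the code above, and assuming cudaSuccess from driver_types) would be to force a collection and retry when a device allocation fails, since it is the finalizers that actually give device memory back:

proc cudaMallocRetry[T](size: int): ptr T =
  # Sketch only: the Nim GC tracks host memory, not device memory, so when a
  # device allocation fails, run a full collection (which fires pending
  # finalizers and therefore cudaFree) and try once more.
  let s = size * sizeof(T)
  if cudaMalloc(cast[ptr pointer](addr result), s) != cudaSuccess:
    GC_fullCollect()
    check cudaMalloc(cast[ptr pointer](addr result), s)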
I double-checked, and the CUDA driver releases all GPU memory allocated by a program upon termination. I also use addQuitProc to make sure I release all my handles.
I guess I'll also add a GC_fullCollect to the exit procedure.
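Something like this, for the record (just a sketch using the stdlib addQuitProc):

# Run a full GC pass at program exit so remaining finalizers (and thus
# cudaFree) get a chance to run before the process terminates.
addQuitProc(proc () {.noconv.} = GC_fullCollect())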