I have searched this forum and checked the docs.
However, when I try to compile the following code with `nim c -d:release -d:speed --stackTrace:off --opt:speed --passL:-s --passc:-flto --passc:-fopenmp --passL:-fopenmp -d:openmp a_openmp`, I get:
...............................................................................fatal.nim(53) sysFatal
Error: unhandled exception: index out of bounds, the container is empty [IndexDefect]
Why does this happen? Thanks.
import std/math
import std/times
# Fixed configuration: these values are never reassigned, so they are
# compile-time constants rather than mutable globals.
const
  beginCount: int32 = 3          # first candidate handed to isPrime
  endCount: int32 = 10_000_000   # last candidate (inclusive)
  cacheSize: int32 = 400         # capacity of primesCache

# Mutable program state.
var
  # Running count of primes found. Starts at 1 -- presumably to account for
  # the prime 2, which the odd-only scan never visits (TODO confirm).
  total: int32 = 1
  # Index of the last filled slot in primesCache; -1 means the cache is empty.
  cachePtr: int32 = -1
  # Primes found so far, used as trial divisors. Sized from cacheSize so the
  # capacity and the array length cannot drift apart.
  primesCache: array[int(cacheSize), int32]
proc isPrime(n:int32): bool =
  ## Trial-division primality test that uses, and extends, the module-level
  ## prime cache (`primesCache` / `cachePtr`).
  ##
  ## `n` is first divided by every cached prime, then by successive odd
  ## numbers above the last cached prime up to `sqrt(n)`. When `n` turns out
  ## to be prime and the cache still has room, `n` is appended to the cache.
  ##
  ## Preconditions (not checked):
  ## * The cache is only meaningful when candidates are passed in ascending
  ##   order starting at 3, because trial division resumes after the last
  ##   cached value and a value already in the cache divides itself.
  ## * The proc reads and writes `cachePtr` and `primesCache` without any
  ##   synchronization, so it must not be called from several threads at
  ##   once.
  # Numbers below 2 are not prime; 2 is the only even prime. Handling them
  # here also keeps 1 and 2 out of the cache, where they would break the
  # odd-step scan below.
  if n < 2:
    return false
  if n mod 2 == 0:
    return n == 2
  # Start at 1 so that, with an empty cache, the first trial divisor is 3
  # (starting at 0 would test only the even numbers 2, 4, 6, ...).
  var cur = int32(1)
  for i in 0 .. cachePtr:
    cur = primesCache[i]
    if n mod cur == 0:
      return false
  let m = int32(sqrt(float(n)))
  # Continue with odd candidates above the last cached prime.
  while true:
    cur += 2
    if cur > m:
      break
    if n mod cur == 0:
      return false
  # n is prime: remember it while the cache has free slots.
  if cachePtr < cacheSize-1:
    cachePtr += 1
    primesCache[cachePtr] = n
  return true
# Count the primes in beginCount..endCount and report the time taken.
# NOTE(review): cpuTime reports CPU time of the process, which may add up
# across OpenMP threads -- confirm whether wall-clock time is wanted.
let startTime = cpuTime()
# `||` is the OpenMP parallel-for iterator. `total` and the cache mutated by
# isPrime are plain globals updated without synchronization, so iterations
# running on different threads race on them; they need per-thread state or
# explicit synchronization before this loop is safe to run in parallel.
for i in `||`(beginCount, endCount):
  # Even candidates are skipped; only odd numbers are tested.
  if (i mod 2) == 0:
    continue
  if isPrime(i):
    total += 1
let elapsed = cpuTime() - startTime
echo "Time taken: ", elapsed, "s"
echo total
`-d:openmp` doesn't do anything by itself. If you saw it in Arraymancer, that is because I have code like this:
# Compiled only when the program is built with -d:openmp.
when defined(openmp):
  doSomething()
Any variable that is shared among threads MUST be protected by thread synchronization.
As shown here: https://github.com/mratsim/Arraymancer/blob/6cfe743/src/arraymancer/tensor/backend/openmp.nim#L18-L34, you need a per-thread cache, or the threads will corrupt each other's data.
# OpenMP runtime bindings with single-threaded fallbacks.
# With -d:openmp the procs below are bound to the declarations in omp.h;
# otherwise same-named templates stand in for them so calling code compiles
# unchanged.
when defined(openmp):
  when not defined(cuda): # For cuda, OpenMP flags must be passed
    {.passC: "-fopenmp".} # behind -Xcompiler -fopenmp
    {.passL: "-fopenmp".}

  # Shorthand pragma: marks a proc as declared in omp.h.
  {.pragma: omp, header:"omp.h".}

  proc omp_set_num_threads*(x: cint) {.omp.}
  proc omp_get_num_threads*(): cint {.omp.}
  proc omp_get_max_threads*(): cint {.omp.}
  proc omp_get_thread_num*(): cint {.omp.}
else:
  # Fallbacks: setting the thread count is a no-op, there is exactly one
  # thread, and its number is 0.
  template omp_set_num_threads*(x: cint) = discard
  template omp_get_num_threads*(): cint = 1
  template omp_get_max_threads*(): cint = 1
  template omp_get_thread_num*(): cint = 0
It is quite possible that, due to the lack of synchronization, cachePtr was incremented past the end of the 400-element cache.