Dear all,
I've timed the parallel example from the docs and found that it executes much slower than a single threaded version. With top -H I can see that threads are idle or underutilized. See below for the code. I've bumped the number of iterations to 1,000,000, set the number of threads to the number of processors on my machine (also tried with the default) and compiled with nim c -d:release and --threads:on as appropriate.
Wall clock times:
Am I misusing this code or misunderstanding something?
Thanks, Andreas
No threading:
import strutils, math

func term(k: float): float =
  ## Returns the `k`-th term of the Leibniz series for pi:
  ## `4 * (-1)^k / (2k + 1)`.
  4 * math.pow(-1, k) / (2*k + 1)

proc pi(n: int): float =
  ## Approximates pi by summing the Leibniz terms for `k = 0 .. n`.
  ##
  ## The terms are first stored in a seq and then added up in index order,
  ## mirroring the structure of the parallel variant so that timings of the
  ## two versions compare like with like.
  var ch = newSeq[float](n+1)
  for k in 0..ch.high:
    ch[k] = term(float(k))
  for k in 0..ch.high:
    result += ch[k]

when isMainModule:
  # Benchmark driver: only runs when this file is the program entry point.
  echo formatFloat(pi(1000000))
Parallel version:
# Compute PI in an inefficient way
import strutils, math
import threadpool
import cpuinfo
{.experimental: "parallel".}

# countProcessors() reports 0 when the core count cannot be detected, which
# is not a valid pool size; keep the pool's default size in that case.
let nProc = countProcessors()
if nProc > 0:
  setMaxPoolSize(nProc)

func term(k: float): float =
  ## Returns the `k`-th term of the Leibniz series for pi:
  ## `4 * (-1)^k / (2k + 1)`.
  4 * math.pow(-1, k) / (2*k + 1)

proc pi(n: int): float =
  ## Approximates pi by summing the Leibniz terms for `k = 0 .. n`, computing
  ## every term as its own spawned task.
  var ch = newSeq[float](n+1)
  parallel:
    # NOTE(review): one task is spawned per term, so each task does only a
    # few floating-point operations; the per-task scheduling cost is
    # presumably what makes this slower than the single-threaded loop.
    for k in 0..ch.high:
      ch[k] = spawn term(float(k))
  # The `parallel` block has finished here, so all of `ch` is filled in.
  for k in 0..ch.high:
    result += ch[k]

when isMainModule:
  # Benchmark driver: only runs when this file is the program entry point.
  echo formatFloat(pi(1000000))
I create countProcessors() (CP) tasks; each of them processes one chunk of ch.len/CP elements:
proc mt_pi(n: int): float =
  ## Approximates pi by summing the Leibniz terms `4 * (-1)^k / (2k + 1)` for
  ## `k = 0 .. n`.
  ##
  ## The terms are written into a seq by one spawned task per detected
  ## processor, each task filling one contiguous chunk; the seq is then summed
  ## in index order on the calling thread.
  proc term(i, n: int, ch: var seq[float]) =
    ## Fills chunk number `i` (0-based) out of `n` chunks of `ch`.
    let
      size = ch.len
      # Ceil division: `n` chunks of this size always cover the whole seq,
      # including when `size < n` or when `size` is not a multiple of `n`.
      chunkSz = (size + n - 1) div n
      # Both bounds are clamped so trailing chunks become empty ranges
      # instead of running past the end.
      rfrom = min(i * chunkSz, size)
      rto = min(rfrom + chunkSz, size)
    for index in rfrom ..< rto: # process in this thread a chunk
      ch[index] = 4 * math.pow(-1, index.float) / (2*index.float + 1)

  # countProcessors() yields 0 when the core count is unknown; fall back to
  # a single chunk rather than dividing by zero.
  let nth = max(1, countProcessors())
  var ch = newSeq[float](n+1)
  # Enable the experimental `parallel` statement locally so this proc does
  # not depend on a module-level switch.
  {.push experimental: "parallel".}
  parallel:
    for k in 0 ..< nth:
      spawn term(k, nth, ch)
  {.pop.}
  for k in 0..ch.high:
    result += ch[k]
Here is a first attempt at a more general solution. I'm looking at the parallel implementation in Rust's rayon crate, which provides parallel iterators, maps, etc.
proc par_apply*[T](v: var seq[T], fnc: proc(i: int): T) =
  ## Sets `v[i] = fnc(i)` for every index of `v`, splitting the index range
  ## into one contiguous chunk per detected processor and handling each chunk
  ## in its own spawned task.
  ##
  ## `fnc` is invoked concurrently from several tasks, so it must be safe to
  ## call from multiple threads; the compiler's GC-safety check is overridden
  ## for that call.
  proc chunkRange(size, i, nth: int): Slice[int] =
    ## Returns the index range of chunk `i` (0-based) out of `nth` chunks.
    ## Ceil division plus clamping guarantees that the `nth` chunks cover all
    ## of `0 ..< size`; chunks past the end come out as empty slices.
    let
      chunkSz = (size + nth - 1) div nth
      rfrom = min(i * chunkSz, size)
      rto = min(rfrom + chunkSz, size)
    rfrom ..< rto

  proc chunkApply(fnc: proc(i: int): T, chunk, nth: int, v: var seq[T]) =
    ## Applies `fnc` to every index belonging to chunk `chunk` of `v`.
    # The proc type of `fnc` carries no gcsafe annotation; the cast lets this
    # worker be spawned while leaving the public signature unchanged.
    {.cast(gcsafe).}:
      for idx in chunkRange(v.len, chunk, nth):
        v[idx] = fnc(idx)

  # countProcessors() yields 0 when the core count is unknown; fall back to
  # a single chunk rather than dividing by zero.
  let nth = max(1, countProcessors())
  # A generic proc is instantiated in the caller's module, so the
  # experimental `parallel` statement has to be enabled locally.
  {.push experimental: "parallel".}
  parallel:
    for chunk in 0 ..< nth:
      spawn chunkApply(fnc, chunk, nth, v)
  {.pop.}