As a learning exercise in parallel computing, I would like to parallel compute the classic definition of the Fibonacci sequence. I realize there are many fast ways to compute the Fibonacci sequence, but I would like to use the classical definition as an exercise.
If I try

```nim
import threadpool
{.experimental: "parallel".}

proc fib(n: int): int =
  if n < 2: n
  else: fib(n-1) + fib(n-2)

proc fib2(n: int): int =
  var n1, n2: int
  if n > 20:
    parallel:
      n1 = spawn fib(n-1)
      n2 = spawn fib(n-2)
    n1 + n2
  else:
    fib(n)

proc main() =
  echo "fib(47) = ", fib2(47)

main()
```
then it runs on 2 threads and roughly halves the elapsed time. But if I try to spawn recursively, to use more threads:

```nim
      n1 = spawn fib2(n-1)
      n2 = spawn fib2(n-2)
```

then I get incorrect answers or endless loops.
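One pattern that avoids the `parallel:` block entirely is to spawn only one half of each call and compute the other half on the current thread, reading the spawned result back with `^`. A minimal sketch using the stdlib `threadpool` (compile with `--threads:on`; this is just an illustration and may still stall for large `n`, since workers blocked in `^` can exhaust the pool):

```nim
import threadpool

proc fib(n: int): int =
  if n < 2: n
  else: fib(n-1) + fib(n-2)

proc fib2(n: int): int =
  if n > 20:
    # spawn one half; compute the other half on the current thread,
    # so each task blocks on at most one child FlowVar
    let f = spawn fib2(n-1)
    let b = fib2(n-2)
    ^f + b
  else:
    fib(n)

echo "fib(30) = ", fib2(30)
```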
From the OCaml 5.0.0 user manual: "Domainslib provides an async/await mechanism for spawning parallel tasks and awaiting their results". The manual contains a parallel Fibonacci example that, on my 6-core system, gives a speedup of about 7x using 12 threads.
Is there something similar in Nim 1.6 or 2.0 RC 1?
```nim
import os, strutils

proc fib(n: int): int =
  if n < 2: n
  else:
    fib(n-1) + fib(n-2)

when defined(asyncthreadpool):
  const threadtype = "asyncthreadpool"
  import asyncthreadpool, asyncdispatch
  let t = newThreadPool()
  template mySpawn(x: untyped): untyped = t.spawn x
  template mySync(x: untyped): untyped = waitFor x
elif defined(taskpools):
  const threadtype = "taskpools"
  import taskpools, cpuinfo
  let t = Taskpool.new(countProcessors()*2)
  template mySpawn(x: untyped): untyped = t.spawn x
  template mySync(x: untyped): untyped = sync x
elif defined(weave):
  const threadtype = "weave"
  import weave
  init(Weave)
  template mySpawn(x: untyped): untyped = spawn x
  template mySync(x: untyped): untyped = sync x
elif defined(stdlib):
  const threadtype = "stdlib"
  import threadpool
  template mySpawn(x: untyped): untyped = spawn x
  template mySync(x: untyped): untyped = ^x
else:
  const threadtype = "serial"
  template mySpawn(x: untyped): untyped = x
  template mySync(x: untyped): untyped = x

proc fib2(n: int): int =
  if n > 20:
    let
      n1 = mySpawn fib2(n-1)
      n2 = fib2(n-2)
    mySync(n1) + n2
  else:
    fib(n)

proc main() =
  let n = commandLineParams()[0].parseInt
  echo threadtype, " fib(", n, ") is ", fib2(n)
  when defined(weave):
    exit(Weave)

main()
```
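For reference, each backend is selected via its `-d:` define at compile time. Assuming the file is saved as `fib_bench.nim` (a name I've made up for illustration), the invocations would look something like:

```shell
nim c -r -d:release --threads:on -d:taskpools fib_bench.nim 40
nim c -r -d:release --threads:on -d:weave fib_bench.nim 40
nim c -r -d:release --threads:on -d:stdlib fib_bench.nim 40
nim c -r -d:release fib_bench.nim 40   # serial fallback
```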
Note that I've changed your example slightly so that tasks themselves spawn other tasks recursively.
This is probably caused by `--exceptions:goto`. Here is the generated C for `fib` without ARC on 1.6.10:
```c
N_LIB_PRIVATE N_NIMCALL(NI, fib)(NI n) {
  NI result;
  result = (NI)0;
  {
    if (!(n < ((NI) 2))) goto LA3_;
    result = n;
  }
  goto LA1_;
  LA3_: ;
  {
    NI T6_;
    NI T7_;
    T6_ = (NI)0;
    T6_ = fib((NI)(n - ((NI) 1)));
    T7_ = (NI)0;
    T7_ = fib((NI)(n - ((NI) 2)));
    result = (NI)(T6_ + T7_);
  }
  LA1_: ;
  return result;
}
```
And with ARC:
```c
N_LIB_PRIVATE N_NIMCALL(NI, fib)(NI n) {
  NI result;
  NI colontmpD_;
  NI colontmpD__2;
  NIM_BOOL* nimErr_;
  {nimErr_ = nimErrorFlag();
  result = (NI)0;
  colontmpD_ = (NI)0;
  colontmpD__2 = (NI)0;
  {
    if (!(n < ((NI) 2))) goto LA3_;
    colontmpD_ = n;
    result = colontmpD_;
  }
  goto LA1_;
  LA3_: ;
  {
    NI T6_;
    NI T7_;
    T6_ = (NI)0;
    T6_ = fib((NI)(n - ((NI) 1)));
    if (NIM_UNLIKELY(*nimErr_)) goto BeforeRet_;
    T7_ = (NI)0;
    T7_ = fib((NI)(n - ((NI) 2)));
    if (NIM_UNLIKELY(*nimErr_)) goto BeforeRet_;
    colontmpD__2 = (NI)(T6_ + T7_);
    result = colontmpD__2;
  }
  LA1_: ;
  }BeforeRet_: ;
  return result;
}
```
| compiler | gc | goto | setjmp |
|---|---|---|---|
| gcc | orc | 6s | 3.7s |
| gcc | refc | 6s | 1.6s |
| clang | orc | 21s | 15s |
| clang | refc | 21s | 15s |
Clang generates the exact same machine code for both; gcc, in reality, unrolls it into a spaghetti monster, and I guess the ORC code just spooks whatever heuristic gcc uses and makes it choose poorly.
Adding `{.inline.}` makes things worse (setjmp: 5s, goto: 10s, for both refc and orc), but here is the table with `fib` declared `{.inline.}` and compiled with `-d:danger`:
| compiler | gc | goto | setjmp |
|---|---|---|---|
| gcc | orc | 0.5s | 8s |
| gcc | refc | 0.5s | 8s |
| clang | orc | 11s | 10s |
| clang | refc | 11s | 10s |
So somehow gcc can turn the recursion into tail calls with goto + inline, and you get properly fast results.
But what, I hear someone ask, if I want to get speed from functions like this without compiling everything with -d:danger?
Given fib.nim:

```nim
import std/[os, strutils]

{.push checks: off, optimization: speed, inline.}
proc fib(n: int): int {.raises: [].} =
  if n < 2: n
  else:
    fib(n-1) + fib(n-2)
{.pop.}

proc main() = echo commandLineParams()[0].parseInt.fib
main()
```
and fib2.nim:

```nim
import std/[os, strutils]

{.push inline.}
proc fib(n: int): int {.raises: [].} =
  if n < 2: n
  else:
    fib(n-1) + fib(n-2)
{.pop.}

proc main() = echo commandLineParams()[0].parseInt.fib
main()
```
When compiled as such:

```
$ nim c -d:release fib.nim
$ nim c -d:danger fib2.nim
```

this benchmark result is consistent:

```
Summary
  './fib 47' ran
    1.10 ± 0.01 times faster than './fib2 47'
```
Solution using taskpools:

```
nimble install taskpools
```
My test program:

```nim
# fib(47) = 2971215073
# without threads: 5.491s; this program with threads: 0.650s
import
  std/cpuinfo,
  taskpools

const nthreads = 12  # 6-core system, 12 hyperthreads
var tp = Taskpool.new(num_threads = nthreads)

proc fib(n: int): int =
  if n < 2: n
  else: fib(n-1) + fib(n-2)

proc fib2(n: int): int =
  var pendingFuts = newSeq[FlowVar[int]](2)
  if n > 25:
    pendingFuts[0] = tp.spawn fib2(n-1)
    pendingFuts[1] = tp.spawn fib2(n-2)
    for k in 0..1:
      result += sync pendingFuts[k]
  else:
    result = fib(n)

proc main() =
  let n = 47
  echo "fib(", n, ") = ", fib2(n)

main()
```