I have this very basic skeleton for a Robin Hood hash. It may contain bugs. But the strange thing is that using one of these two lines makes a gigantic difference in the time needed to run the tests: (Nim v0.17, GCC 6.3)
#var a: array[N2, string]
var a = newSeq[string](N2)
import random
import hashes
import times
import math
import bitops
import strutils
# Reduce an index or hash value to a valid bucket index by masking with
# `m.hi` (the buffer size is a power of two, so `m.hi` is an all-ones mask).
# `{.dirty.}` makes the template pick up the caller's local `m`.
template masked(i: untyped): typed {.dirty.}=
  i and m.hi
type
  Entry[K, V] = object
    k: K        # key
    v: V        # value
    p, n: int   # previous, next slot index of the insertion-order list
                # NOTE(review): `p` is never written by `insert` — TODO confirm
  Map[K, V] = object
    hv: seq[int]           # hash value per bucket; 0 marks an empty bucket
    d: seq[Entry[K, V]]    # data buckets, parallel to `hv`
    first, last: int       # head/tail of the insertion-order list
                           # NOTE(review): `last` is never updated — TODO confirm
    numentries: int        # entry count; not maintained by `insert` in this skeleton
    hi: int                # buffer size - 1; power-of-two bit mask used by `masked`
proc hnz[K](x: K): Hash =
  ## Hash of `x` that is guaranteed to be non-zero, because zero is reserved
  ## as the "empty bucket" marker in `Map.hv`. Keys are hashed via their
  ## string representation, so any type with a `$` operator works.
  let raw = hash($x)
  result = if raw != 0: raw else: -1
proc newTable[K, V](initialCapacity: Natural; maxFillRate: range[10 .. 95] = 50): Map[K, V] =
  ## Create an empty map whose hash buffer size is always a power of two.
  ##
  ## `maxFillRate` is the desired maximum fill of the hash buffer in percent
  ## (10..95); small values give better lookup performance. If
  ## `initialCapacity` is already a power of two, the buffer starts at exactly
  ## that size; otherwise it is scaled up so that `initialCapacity` entries
  ## respect `maxFillRate`, then rounded to the next power of two. Doubling
  ## the buffer when the actual fill rate exceeds the desired one is intended
  ## but not implemented in this skeleton.
  var size = initialCapacity
  if countSetBits(size) > 1:            # not a power of two: apply the fill rate
    size =
      if size < 10_000_000:             # multiply first for accuracy ...
        size * 100 div maxFillRate
      else:                             # ... or divide first to dodge 32-bit overflow
        size div maxFillRate * 100
  size = nextPowerOfTwo(size)
  result.hv = newSeq[int](size)         # zero-filled: every bucket starts empty
  result.d = newSeq[Entry[K, V]](size)
  result.numentries = 0
  result.hi = size - 1                  # power-of-two mask consumed by `masked`
proc insert[K, V](m: var Map[K, V]; key: K; val: V) =
  ## Robin-Hood-style insert of (key, val); also links the new slot into the
  ## insertion-order list via `first`/`n`.
  ## NOTE(review): no duplicate-key check and no growth/occupancy check —
  ## filling every bucket makes the probe loop spin forever; `numentries` and
  ## `last` are never updated. TODO confirm this is intentional for a skeleton.
  let h = hnz(key)            # non-zero hash; 0 marks an empty bucket
  let p = masked(h)           # home bucket of the key
  var i = p
  var j = p
  # Linear probe forward to the first empty bucket.
  while m.hv[masked(i)] != 0:
    inc(i)
  # Walk back from the empty bucket, shifting entries toward it while their
  # probe distance is smaller than ours (Robin Hood displacement).
  while true:
    j = i
    # NOTE(review): on the first iteration `j` may still be un-masked (the
    # probe loop above increments raw `i`), so `m.hv[j]` below could index
    # out of bounds — suspect bug, TODO confirm.
    i = masked(i - 1)
    if i - masked(m.hv[i]) < i - p:
      # NOTE(review): both differences can go negative on wrap-around; the
      # usual Robin Hood distance is (i - home) mod tableSize — suspect bug.
      m.hv[j] = m.hv[i]
      m.d[j] = m.d[i]
    else:
      break
  m.hv[j] = h
  m.d[j].v = val
  m.d[j].k = key
  m.d[j].n = m.first          # prepend to the insertion-order list
  m.first = j
proc contains[K, V](m: Map[K, V]; key: K): bool =
  ## True if `key` is stored in the map.
  let h = hnz(key)
  let p = masked(h)           # home bucket
  var i = p
  while true:
    let hv = m.hv[i]
    if hv == h:
      # NOTE(review): returns on the FIRST bucket with a matching hash value;
      # a different key that collides on the same hash would make this yield
      # `false` instead of probing further — suspect bug, TODO confirm.
      return m.d[i].k == key
    elif hv == 0 or masked(hv) > p:
      # Empty bucket, or the resident entry's home bucket lies past ours —
      # with Robin Hood ordering the key cannot be further along.
      # NOTE(review): this cutoff ignores wrap-around at the buffer end and
      # probably misclassifies wrapping clusters — TODO confirm.
      return false
    i = masked(i + 1)
proc get[K, V](m: Map[K, V]; key: K): V =
  ## Return the value stored for `key`; raises `KeyError` when absent.
  let h = hnz(key)
  let p = masked(h)           # home bucket
  var i = p
  while true:
    let hv = m.hv[i]
    if hv == h and m.d[i].k == key:
      return m.d[i].v
    elif hv == 0 or masked(hv) > p:
      # Empty bucket, or the resident entry's home bucket lies past ours —
      # with Robin Hood ordering the key cannot be further along.
      # NOTE(review): this cutoff ignores wrap-around at the buffer end —
      # probably fails for clusters that wrap; TODO confirm.
      when compiles($key):
        raise newException(KeyError, "key not found: " & $key)
      else:
        raise newException(KeyError, "key not found")
    i = masked(i + 1)
proc test() =
  ## Smoke test and micro-benchmark: insert half of the shuffled keys,
  ## verify membership and values, verify the other half is absent, then
  ## time raw inserts and lookups into a fresh table.
  const
    N = 1024 * 32
    N2 = N div 2
  var m = newTable[string, int](N)
  #var a: array[N2, string]
  var a = newSeq[string](N2)   # seq variant: only one GC root on the stack
  for i in 0 .. a.high: a[i] = $i
  a.shuffle                    # deterministic here (no `randomize()` call)
  #echo "aaa"
  for i in 0 .. a.len div 2 - 1:
    m.insert(a[i], parseInt(a[i]))
  #echo "bbb"
  for i in 0 .. a.len div 2 - 1:
    assert(m.contains(a[i]))
    assert(m.get(a[i]) == parseInt(a[i]))
  #echo "ccc"
  for i in a.len div 2 .. a.high:
    assert(not m.contains(a[i]))
  #GC_disable()
  m = newTable[string, int](N)
  var t = cpuTime()
  for p, v in a:
    m.insert(v, p)
  echo((cpuTime() - t) / a.len.float)   # mean seconds per insert
  #echo "ddd"
  t = cpuTime()
  for p, v in a:
    doassert(m.get(v) >= 0)   # `doassert` == `doAssert` (style-insensitive names)
  echo((cpuTime() - t) / a.len.float)   # mean seconds per lookup
# Run the smoke test / benchmark only when compiled as the main module.
when isMainModule:
  test()
$ time ./t
5.877685546874999e-08
3.784179687499999e-08
real 0m0.005s
user 0m0.005s
sys 0m0.000s
$ time ./t
0.0004289649047851563
4.602050781242744e-08
real 0m12.911s
user 0m12.906s
sys 0m0.000s
My initial guess was a stack overflow, meaning the code would not be working properly at all. But when I compile with plain "nim c t.nim" I get no runtime error. Another idea was pass-by-value vs. pass-by-reference, but I cannot currently see that being the cause.
It is likely an issue with the GC. An easy way to detect where your program spends the most time is to send a SIGINT (CTRL+C in the terminal) and analyse the traceback.
I repeated the operation a few times and it was always inside some method (grow, CellSet) in gc.nim I quickly checked with another GC --gc:markAndSweep and the problem doesn't occur.
It is roughly as fast as with a seq
Thanks for your investigations. GC was really not on my list of possible sources.
(And indeed, with --gc:markAndSweep all is fine!)
Good to know.
Better to know exactly. (Well, soon we may get the book from Manning; they took half a year to elaborate Dom's draft, so hopefully they explain all these details... :-)
But seriously, I assume your conclusion is not correct. I myself am still surprised by that behaviour, and I think it may be a trap for many beginners. But I think the special aspect of my code is that the array contains strings, and that strings are watched by the GC. My code was only intended for initial testing; generally I would tend to avoid such large objects on the stack, primarily to avoid stack overflows. And I have to admit that I currently do not understand why the seq is no problem for the GC, as it also contains all the strings.
I do not understand currently why the seq is no problem for the GC
I don't know either, but I guess it's because the GC has to check all the roots on the stack. For the seq it only has to check one root (the strings inside the seq are reference counted), while for the array it has to check 16k roots.
I just made a small test and increased the ref count of each string in the array at the beginning and decreased it at the end. The result is, that the time of array variant decreased from 15sec to 2 sec on my PC.
import random
import hashes
import times
import math
import bitops
import strutils
# Reduce an index or hash value to a valid bucket index by masking with
# `m.hi` (buffer size is a power of two, so `m.hi` is an all-ones mask);
# `{.dirty.}` makes the template pick up the caller's local `m`.
template masked(i: untyped): typed {.dirty.}=
  i and m.hi
type
  Entry[K, V] = object
    k: K        # key
    v: V        # value
    p, n: int   # previous, next slot index of the insertion-order list
  Map[K, V] = object
    hv: seq[int]           # hash value per bucket; 0 marks an empty bucket
    d: seq[Entry[K, V]]    # data buckets, parallel to `hv`
    first, last: int       # head/tail of the insertion-order list
    numentries: int        # entry count (not maintained by `insert`)
    hi: int                # buffer size - 1; power-of-two mask for `masked`
proc hnz[K](x: K): Hash =
  ## Non-zero hash of `x`; zero is reserved as the empty-bucket marker.
  result = hash($x)
  if result == 0:
    result = -1
proc newTable[K, V](initialCapacity: Natural; maxFillRate: range[10 .. 95] = 50): Map[K, V] =
  ## `maxFillRate` is the desired maximum fill of the hash buffer in percent;
  ## the value should be in the range 10 to 95%. A small value gives optimal
  ## performance.
  ## If `initialCapacity` is a power of 2, we start with a buffer of that size;
  ## otherwise we create an initial buffer that can contain `initialCapacity`
  ## elements, taking into account the desired maximum fill rate.
  ## Whenever the actual fill rate is larger than the desired value, the buffer
  ## should be increased by a factor of two (growth is not implemented in this
  ## skeleton). The actual buffer size is always a power of two.
  # we may care a bit about arithmetic overflow on pure 32 bit: cap = 1e8, fillrate 100%
  var h = initialCapacity
  if countSetBits(h) > 1:   # not already a power of two: apply the fill rate
    if h < 1e7.int: # multiply first for accuracy, but prevent overflow on 32 bit
      h = h * 100 div maxFillrate
    else:
      h = h div maxFillrate * 100
  # if h < 8: h = 8 # tiny values make no real sense and may generate trouble
  h = nextPowerOfTwo(h)
  #result.data = newSeq[Entry[K, V]](result.hi)
  #dec(result.hi)
  result.numentries = 0
  #result.fillrate = maxFillrate
  #if result.hi < 1e7.int:
  #  result.cfr = result.hi * fillrate div 100
  #else:
  #  result.cfr = result.hi div 100 * fillrate
  #assert(result.cfr < result.hi)
  result.d = newSeq[Entry[K, V]](h)
  result.hv = newSeq[int](h)    # zero-filled: every bucket starts empty
  result.hi = h - 1             # power-of-two mask consumed by `masked`
proc insert[K, V](m: var Map[K, V]; key: K; val: V) =
  ## Robin-Hood-style insert of (key, val); also links the new slot into the
  ## insertion-order list via `first`/`n`.
  ## NOTE(review): no duplicate-key check and no growth/occupancy check —
  ## filling every bucket makes the probe loop spin forever; `numentries` and
  ## `last` are never updated. TODO confirm this is intentional for a skeleton.
  let h = hnz(key)            # non-zero hash; 0 marks an empty bucket
  let p = masked(h)           # home bucket of the key
  var i = p
  var j = p
  # Linear probe forward to the first empty bucket.
  while m.hv[masked(i)] != 0:
    inc(i)
  # Walk back from the empty bucket, shifting entries toward it while their
  # probe distance is smaller than ours (Robin Hood displacement).
  while true:
    j = i
    # NOTE(review): on the first iteration `j` may still be un-masked (the
    # probe loop above increments raw `i`), so `m.hv[j]` below could index
    # out of bounds — suspect bug, TODO confirm.
    i = masked(i - 1)
    if i - masked(m.hv[i]) < i - p:
      # NOTE(review): both differences can go negative on wrap-around; the
      # usual Robin Hood distance is (i - home) mod tableSize — suspect bug.
      m.hv[j] = m.hv[i]
      m.d[j] = m.d[i]
    else:
      break
  m.hv[j] = h
  m.d[j].v = val
  m.d[j].k = key
  m.d[j].n = m.first          # prepend to the insertion-order list
  m.first = j
proc contains[K, V](m: Map[K, V]; key: K): bool =
  ## True if `key` is stored in the map.
  let h = hnz(key)
  let p = masked(h)           # home bucket
  var i = p
  while true:
    let hv = m.hv[i]
    if hv == h:
      # NOTE(review): returns on the FIRST bucket with a matching hash value;
      # a different key that collides on the same hash would make this yield
      # `false` instead of probing further — suspect bug, TODO confirm.
      return m.d[i].k == key
    elif hv == 0 or masked(hv) > p:
      # Empty bucket, or the resident entry's home bucket lies past ours —
      # with Robin Hood ordering the key cannot be further along.
      # NOTE(review): this cutoff ignores wrap-around at the buffer end and
      # probably misclassifies wrapping clusters — TODO confirm.
      return false
    i = masked(i + 1)
proc get[K, V](m: Map[K, V]; key: K): V =
  ## Return the value stored for `key`; raises `KeyError` when absent.
  let h = hnz(key)
  let p = masked(h)           # home bucket
  var i = p
  while true:
    let hv = m.hv[i]
    if hv == h and m.d[i].k == key:
      return m.d[i].v
    elif hv == 0 or masked(hv) > p:
      # Empty bucket, or resident entry's home bucket lies past ours — absent.
      # NOTE(review): the cutoff ignores wrap-around at the buffer end.
      when compiles($key):
        raise newException(KeyError, "key not found: " & $key)
      else:
        raise newException(KeyError, "key not found")
    i = masked(i + 1)
proc test() =
  ## Same smoke test / micro-benchmark, but with the stack `array` variant
  ## plus explicit GC_ref/GC_unref on every string — the experiment that cut
  ## the array variant's runtime from ~15 s to ~2 s by reducing the number of
  ## stack roots the GC must scan.
  const
    N = 1024 * 32
    N2 = N div 2
  var m = newTable[string, int](N)
  var a: array[N2, string]     # 16k string roots on the stack for the GC
  #var a = newSeq[string](N2)
  for i in 0 .. a.high: a[i] = $i
  # increase the ref count of each string in the array
  for i in 0 .. a.high: GC_ref(a[i])
  a.shuffle                    # deterministic here (no `randomize()` call)
  #echo "aaa"
  for i in 0 .. a.len div 2 - 1:
    m.insert(a[i], parseInt(a[i]))
  #echo "bbb"
  for i in 0 .. a.len div 2 - 1:
    assert(m.contains(a[i]))
    assert(m.get(a[i]) == parseInt(a[i]))
  #echo "ccc"
  for i in a.len div 2 .. a.high:
    assert(not m.contains(a[i]))
  #GC_disable()
  m = newTable[string, int](N)
  var t = cpuTime()
  for p, v in a:
    m.insert(v, p)
  echo((cpuTime() - t) / a.len.float)   # mean seconds per insert
  #echo "ddd"
  t = cpuTime()
  for p, v in a:
    doassert(m.get(v) >= 0)   # `doassert` == `doAssert` (style-insensitive names)
  echo((cpuTime() - t) / a.len.float)   # mean seconds per lookup
  # decrease it again so the strings can be collected
  for i in 0 .. a.high: GC_unref(a[i])
# Run the smoke test / benchmark only when compiled as the main module.
when isMainModule:
  test()