Good morning,
I'm noticing that my code slows down the more data I process. I think I've isolated the problem to the count table (`result` in proc `get_min_frq`). After loading ~10 million data points into the count table, the code slows down and likely stalls completely. So what have I tried?
type
  Min* = uint64
    ## Minimizer value; an alias for an unsigned 64-bit integer.

  minimizer_t* = object
    ## One minimizer record.
    minimizer*: Min   ## the minimizer value itself
    pos*: uint32      ## presumably the position in the source sequence - TODO confirm
    strand*: Strand   ## `Strand` is declared outside this snippet
# Fragment of a sliding-window pass over `mins`. The enclosing proc header and
# the declarations of `mins`, `counter`, `window`, `win_min`, `win_min_idx`
# and `last_min` are not part of this snippet.
# NOTE(review): indentation was lost in this paste; the nesting of the
# statements below must be confirmed against the real source.
for i in mins:
inc(counter)
# Resets the tracked minimum when its stored index is older than
# `counter - window`.
# NOTE(review): after the reset only the current element is compared below;
# earlier elements still inside the window are not rescanned - confirm this
# is intended.
if win_min_idx < (counter - window):
win_min = high(uint64)
win_min_idx = -1
# Current element becomes the tracked minimum if it is strictly smaller.
if i.minimizer < win_min:
win_min = i.minimizer
win_min_idx = counter
# Presumably nothing is emitted until `window` elements have been seen -
# TODO confirm.
if counter < window:
continue
# Appends the current element `i` only when the tracked minimum changed.
# `last_min` and `lastMin` are the same identifier (Nim is style-insensitive).
if last_min != win_min:
result.add(i)
lastMin = win_min
proc get_min_frq*(db_prefix: string): CountTable[Min] =
  ## Counts adjacent-minimizer pair keys across every database file that
  ## matches `{db_prefix}*.min.msgpck`.
  ##
  ## For each entry of each loaded database, `mins` is passed through
  ## `min_min` twice (second argument 12 both times). Every adjacent pair of
  ## the resulting records is merged into a single `Min` key: the first
  ## minimizer shifted left by 32 bits, OR-ed with the second. The returned
  ## table holds the number of occurrences of each key.
  let pattern = "{db_prefix}*.min.msgpck".fmt
  for dbPath in walkPattern(pattern):
    let entries = load_min_db(dbPath)
    for key, entry in entries:
      let firstPass = min_min(entry.mins, 12)
      let secondPass = min_min(firstPass, 12)
      # Walk adjacent pairs (idx, idx + 1); empty for fewer than two records.
      for idx in 0 ..< (secondPass.len - 1):
        let
          left = secondPass[idx].minimizer
          right = secondPass[idx + 1].minimizer
        result.inc((left shl 32) or right)
**Just using a random uint64 as the key:**
proc get_min_frq*(db_prefix: string): CountTable[Min] =
  ## Experimental variant: walks the same databases and minimizer pairs as
  ## the regular counter, but increments the table with a pseudo-random
  ## `uint64` (generator seeded with 2019) instead of the pair key.
  ##
  ## The pair key is still computed so that the only difference between the
  ## two variants is the value handed to `inc`.
  var rng = initRand(2019)
  stderr.writeLine("[INFO] counting N.M.P.s".fmt)
  let pattern = "{db_prefix}*.min.msgpck".fmt
  for dbPath in walkPattern(pattern):
    let entries = load_min_db(dbPath)
    for key, entry in entries:
      let firstPass = min_min(entry.mins, 12)
      let secondPass = min_min(firstPass, 12)
      for idx in 0 ..< (secondPass.len - 1):
        # Computed but deliberately not used as the table key in this variant.
        let nm: Min = (secondPass[idx].minimizer shl 32) or
                      secondPass[idx + 1].minimizer
        result.inc(rng.next())
Stand-alone timing test of CountTable:
import random
import times
import tables
import strformat
when isMainModule:
  # Stand-alone timing harness: inserts pseudo-random uint64 keys (seed 2019)
  # into a CountTable and prints, for every iteration, the wall-clock time of
  # the single `inc` call together with the current table size.
  const lastIteration = 50000000   # loop bound is inclusive
  var
    ct = initCountTable[uint64]()
    rng = initRand(2019)
  for i in 0 .. lastIteration:
    let before = now()
    ct.inc(rng.next())
    let after = now()
    echo fmt"iteration i:{i} time = {after - before} ; size of ct = {ct.len}"
Update: the default hash function used by CountTable (in Nim 0.20.99) was likely to blame. Pre-hashing my data fixed the problem.
func min_hash(key: uint64; mask: uint64): uint64 =
  ## Mixes the bits of `key` and returns the result restricted to `mask`.
  ##
  ## `key`  - value to scramble before it is used as a table key.
  ## `mask` - bit mask applied after every additive step; pass
  ##          `high(uint64)` to keep all 64 bits.
  ##
  ## All arithmetic is unsigned and wraps modulo 2^64.
  ## NOTE(review): the step sequence looks like Thomas Wang's 64-bit integer
  ## mix - confirm against the reference it was ported from.
  # Equivalent to (h shl 21) - h - 1.
  var h = ((not key) + (key shl 21)) and mask
  h = h xor (h shr 24)
  h = ((h + (h shl 3)) + (h shl 8)) and mask   # h * 265
  h = h xor (h shr 14)
  h = ((h + (h shl 2)) + (h shl 4)) and mask   # h * 21
  h = h xor (h shr 28)
  h = (h + (h shl 31)) and mask
  result = h