Hello, I was testing a simple stream algorithm with a slice window, but performance is a bit disappointing! A python 3 version is actually in average 2x faster than the nim version.
My code:
import tables, times
const WORD_SIZE = 4
const K = 10000
iterator window(line: string, size: int): string =
for i in 0 .. line.len - size:
yield line[i .. i + size - 1]
proc counter(file: string, wordSize: int, counters: int): TableRef[string, int] =
var lines = ""
for line in lines(file):
lines.add(line)
echo "ready"
var counts = newTable[string, int]()
for word in window(lines, wordSize):
if counts.contains(word):
counts[word] += 1
elif counts.len < counters:
counts[word] = 1
else:
for pair in counts.pairs:
let (key, value) = pair
if value == 1:
counts.del(key)
else:
counts[key] -= 1
return counts
proc printTop(table: TableRef[string, int], top: int): void =
var sorted = newCountTable[string]()
for key in table.keys:
sorted[key] = table[key]
sorted.sort
var n = 0
for pair in sorted.pairs:
inc n
if n > top: break
let (key, value) = pair
echo $n, ": '", key, "' -> ", value
let t0 = cpuTime()
let res = counter("enwik8", WORD_SIZE, K)
echo "CPU time [s] ", cpuTime() - t0
res.printTop(30)
I use this to process a 100MB text file. I'm betting the problem is on the string slice on the window iterator, probably constructing a string copy instead of reusing the reference! How can I improve this?
Also, why there isn't a del procedure in the CountTable type?
well you can create your own StringSlice type that does not copy and override the slicing:
type
StringSlice
src: ptr string
a,b: int
proc `$`(arg: StringSlice): string =
(arg.src[])[arg.a .. arg.b]
Is that correct?
type
StringSlice
src: ptr string
a,b: int
Gives me a compilation error! Is it instead:
type
StringSlice = object
src: ptr string
a,b: int
Also how can I use this with the type Table:
var t = newTable[StringSlice, int]()
Also gives me compilation error: type mismatch: got (StringSlice) For tables to work you need to define a hash function for your custom type:
proc hash(s: StringSlice): Hash
https://nim-lang.org/docs/hashes.html for details
I've come up with an optimized version which takes about 5 seconds on my machine, vs the 48 seconds that the Python script takes. This version doesn't print out the top words (that would take a bit more work, and another map of hashes to strings)
import tables, times, hashes
const WORD_SIZE = 4
const K = 10000
iterator window(line: string, size: int): Hash =
for start_pos in 0 .. line.len - size:
let end_pos = start_pos + size - 1
yield hash(line, start_pos, end_pos)
proc counter(file: string, wordSize: int, counters: int): TableRef[Hash, int] =
var lines = ""
for line in lines(file):
lines.add(line)
echo "ready"
var counts = newTable[Hash, int]()
for word_hash in window(lines, wordSize):
if counts.contains(word_hash):
counts[word_hash] += 1
elif counts.len < counters:
counts[word_hash] = 1
else:
var keys_to_delete = newSeq[Hash]()
for key, value in counts:
if value == 1:
keys_to_delete.add(key)
else:
counts[key] -= 1
for key in keys_to_delete:
counts.del(key)
return counts
# proc printTop(table: TableRef[string, int], top: int): void =
# var sorted = newCountTable[string]()
# for key in table.keys:
# sorted[key] = table[key]
# sorted.sort
# var n = 0
# for pair in sorted.pairs:
# inc n
# if n > top: break
# let (key, value) = pair
# echo $n, ": '", key, "' -> ", value
let t0 = cpuTime()
let res = counter("enwik8", WORD_SIZE, K)
echo "CPU time [s] ", cpuTime() - t0
#res.printTop(30)
With that version in my file: enwik8 drops to 53s vs the python 64s. But I can't see the results with that version. And if I save the string slice I will be back to the same problem.
The python version:
from timeit import default_timer as timer
WORD_SIZE = 12
K = 10000
def window(line, size):
for i in range(len(line) - size + 1):
yield line[i : i + size]
def counter(file, size, k):
lines = ""
for line in open(file):
lines += line
counts = {}
for word in window(lines, size):
if word in counts:
counts[word] += 1
elif len(counts) < k:
counts[word] = 1
else:
to_remove = []
for i in counts:
if counts[i] == 1:
to_remove.append(i)
else:
counts[i] -= 1
for r in to_remove:
del counts[r]
return counts
def printTop(table, top):
sorted_keys = sorted(table, key = table.__getitem__, reverse = True)
n = 0
for key in sorted_keys:
n += 1
if n > top: break
escaped_key = key.replace('\n', '\\n')
print("{}: '{}' -> {}".format(n, escaped_key, table[key]))
t0 = timer()
res = counter("enwik8", WORD_SIZE, K)
print("CPU time [s] ", timer() - t0)
printTop(res, 30)
Isn't the following piece of code does the same lookup in table three times:
if counts.contains(word_hash):
counts[word_hash] += 1
elif counts.len < counters:
counts[word_hash] = 1
else:
You can do a single lookup by leveraging withValue
https://nim-lang.org/docs/tables.html#withValue.t,Table%5BA,B%5D,A,untyped,untyped,untyped
and
Of course the swarm of string copies needs to be eliminated if you care about performance.
V2:
import tables, times, hashes
const WORD_SIZE = 4
const K = 10000
iterator wordSlices(line: string, size: int): Slice[int] =
for startIndex in 0 .. len(line) - size:
let endIndex = startIndex + size - 1
yield startIndex..endIndex
proc counter(file: string, wordSize: int, counters: int): TableRef[string, int] =
var lines = ""
for line in lines(file):
lines.add(line)
echo "ready"
var
hashToWordMap = newTable[Hash, string]()
wordCountsMap = newTable[Hash, int]()
for wordSlice in wordSlices(lines, wordSize):
let wordHash = hash(lines, wordSlice.a, wordSlice.b)
if wordHash notin hashToWordMap:
hashToWordMap[wordHash] = lines[wordSlice]
if wordHash in wordCountsMap:
wordCountsMap[wordHash] += 1
elif len(wordCountsMap) < counters:
wordCountsMap[wordHash] = 1
else:
var keysToDelete = newSeq[Hash]()
for key, value in wordCountsMap:
if value == 1:
keysToDelete.add(key)
else:
wordCountsMap[key] -= 1
for key in keysToDelete:
wordCountsMap.del(key)
result = newTable[string, int]()
for key, value in wordCountsMap:
result[hashToWordMap[key]] = value
proc printTop(table: TableRef[string, int], top: int): void =
var sorted = newCountTable[string]()
for key in table.keys:
sorted[key] = table[key]
sorted.sort
var n = 0
for pair in sorted.pairs:
inc n
if n > top: break
let (key, value) = pair
echo $n, ": '", key, "' -> ", value
let t0 = cpuTime()
let res = counter("enwik8", WORD_SIZE, K)
echo "CPU time [s] ", cpuTime() - t0
res.printTop(30)