nimforum mirror - String slice performance!

shumy (orginal) [2017-07-20T16:55:47+02:00] view original

Hello, I was testing a simple stream algorithm with a slice window, but performance is a bit disappointing! A python 3 version is actually in average 2x faster than the nim version.

My code:

import tables, times

const WORD_SIZE = 4
const K = 10000

iterator window(line: string, size: int): string =
  for i in 0 .. line.len - size:
    yield line[i .. i + size - 1]

proc counter(file: string, wordSize: int, counters: int): TableRef[string, int] =
  var lines = ""
  for line in lines(file):
    lines.add(line)
  echo "ready"
  
  var counts = newTable[string, int]()
  for word in window(lines, wordSize):
    if counts.contains(word):
      counts[word] += 1
    elif counts.len < counters:
      counts[word] = 1
    else:
      for pair in counts.pairs:
        let (key, value) = pair
        if value == 1:
          counts.del(key)
        else:
          counts[key] -= 1
  
  return counts

proc printTop(table: TableRef[string, int], top: int): void =
  var sorted = newCountTable[string]()
  for key in table.keys:
    sorted[key] = table[key]
  sorted.sort
  
  var n = 0
  for pair in sorted.pairs:
    inc n
    if n > top: break
    let (key, value) = pair
    echo $n, ": '", key, "' -> ", value

let t0 = cpuTime()
let res = counter("enwik8", WORD_SIZE, K)
echo "CPU time [s] ", cpuTime() - t0

res.printTop(30)

I use this to process a 100MB text file. I'm betting the problem is on the string slice on the window iterator, probably constructing a string copy instead of reusing the reference! How can I improve this?

Also, why there isn't a del procedure in the CountTable type?

Krux02 (orginal) [2017-07-20T18:51:10+02:00] view original

well you can create your own StringSlice type that does not copy and override the slicing:

type
  StringSlice
    src: ptr string
    a,b: int

proc `$`(arg: StringSlice): string =
  (arg.src[])[arg.a .. arg.b]

shumy (orginal) [2017-07-20T19:04:32+02:00] view original

Is that correct?

type
  StringSlice
    src: ptr string
    a,b: int

Gives me a compilation error! Is it instead:

type
  StringSlice = object
    src: ptr string
    a,b: int

Also how can I use this with the type Table:

var t = newTable[StringSlice, int]()

Also gives me compilation error: type mismatch: got (StringSlice)

cdome (orginal) [2017-07-20T19:16:19+02:00] view original

For tables to work you need to define a hash function for your custom type:

proc hash(s: StringSlice): Hash

https://nim-lang.org/docs/hashes.html for details

cdome (orginal) [2017-07-20T19:18:03+02:00] view original

It also feels like CountTable is better choice for your problem. https://nim-lang.org/docs/tables.html#CountTable

Varriount (orginal) [2017-07-20T20:03:15+02:00] view original

Could you post your Python source code too? Often when people post these kinds of comparisons, their programs are doing slightly different things.

Varriount (orginal) [2017-07-21T10:43:39+02:00] view original

I've come up with an optimized version which takes about 5 seconds on my machine, vs the 48 seconds that the Python script takes. This version doesn't print out the top words (that would take a bit more work, and another map of hashes to strings)

import tables, times, hashes

const WORD_SIZE = 4
const K = 10000

iterator window(line: string, size: int): Hash =
  for start_pos in 0 .. line.len - size:
    let end_pos = start_pos + size - 1
    yield hash(line, start_pos, end_pos)

proc counter(file: string, wordSize: int, counters: int): TableRef[Hash, int] =
  var lines = ""
  for line in lines(file):
    lines.add(line)
  echo "ready"
  
  var counts = newTable[Hash, int]()
  for word_hash in window(lines, wordSize):
    if counts.contains(word_hash):
      counts[word_hash] += 1
    elif counts.len < counters:
      counts[word_hash] = 1
    else:
      var keys_to_delete = newSeq[Hash]()
      for key, value in counts:
        if value == 1:
          keys_to_delete.add(key)
        else:
          counts[key] -= 1
      for key in keys_to_delete:
        counts.del(key)
  
  return counts

# proc printTop(table: TableRef[string, int], top: int): void =
#   var sorted = newCountTable[string]()
#   for key in table.keys:
#     sorted[key] = table[key]
#   sorted.sort

#   var n = 0
#   for pair in sorted.pairs:
#     inc n
#     if n > top: break
#     let (key, value) = pair
#     echo $n, ": '", key, "' -> ", value

let t0 = cpuTime()
let res = counter("enwik8", WORD_SIZE, K)
echo "CPU time [s] ", cpuTime() - t0

#res.printTop(30)

shumy (orginal) [2017-07-21T12:07:14+02:00] view original

With that version in my file: enwik8 drops to 53s vs the python 64s. But I can't see the results with that version. And if I save the string slice I will be back to the same problem.

The python version:

from timeit import default_timer as timer

WORD_SIZE = 12
K = 10000

def window(line, size):
  for i in range(len(line) - size + 1):
    yield line[i : i + size]

def counter(file, size, k):
  lines = ""
  for line in open(file):
    lines += line
  
  counts = {}
  for word in window(lines, size):
    if word in counts:
      counts[word] += 1
    elif len(counts) < k:
      counts[word] = 1
    else:
      to_remove = []
      for i in counts:
        if counts[i] == 1:
          to_remove.append(i)
        else:
          counts[i] -= 1
      
      for r in to_remove:
        del counts[r]
  
  return counts

def printTop(table, top):
  sorted_keys = sorted(table, key = table.__getitem__, reverse = True)
  
  n = 0
  for key in sorted_keys:
    n += 1
    if n > top: break
    escaped_key = key.replace('\n', '\\n')
    print("{}: '{}' -> {}".format(n, escaped_key, table[key]))

t0 = timer()
res = counter("enwik8", WORD_SIZE, K)
print("CPU time [s] ", timer() - t0)

printTop(res, 30)

shumy (orginal) [2017-07-21T12:09:54+02:00] view original

To "cdome": I didn't find any delete proc in the CountTable, that's why I didn't use it.

cdome (orginal) [2017-07-21T14:58:09+02:00] view original

Isn't the following piece of code does the same lookup in table three times:

if counts.contains(word_hash):
  counts[word_hash] += 1
elif counts.len < counters:
  counts[word_hash] = 1
else:

You can do a single lookup by leveraging withValue

https://nim-lang.org/docs/tables.html#withValue.t,Table%5BA,B%5D,A,untyped,untyped,untyped

and

Of course the swarm of string copies needs to be eliminated if you care about performance.

shumy (orginal) [2017-07-21T15:28:48+02:00] view original

"cdome", actually just 2 times since only selects one branch of the condition. But thanks for the tip, because it dropped to 38s just by changing the conditions order.

Jehan (orginal) [2017-07-21T16:36:39+02:00] view original

I suspect that you are compiling the Nim code without optimization, hence why it appears to be slow. If I compile it with -d:release, it beats the Python script by a factor of 3. You may not want to use -d:release, since that also turns off all runtime checks, but just using --opt:speed --stacktrace:off will get you almost all of the performance, you just won't get nice stack traces if an error happens.

Varriount (orginal) [2017-07-21T16:39:05+02:00] view original

V2:

import tables, times, hashes

const WORD_SIZE = 4
const K = 10000

iterator wordSlices(line: string, size: int): Slice[int] =
  for startIndex in 0 .. len(line) - size:
    let endIndex = startIndex + size - 1
    yield startIndex..endIndex

proc counter(file: string, wordSize: int, counters: int): TableRef[string, int] =
  var lines = ""
  for line in lines(file):
    lines.add(line)
  echo "ready"
  
  var
    hashToWordMap = newTable[Hash, string]()
    wordCountsMap = newTable[Hash, int]()
  
  for wordSlice in wordSlices(lines, wordSize):
    let wordHash = hash(lines, wordSlice.a, wordSlice.b)
    
    if wordHash notin hashToWordMap:
      hashToWordMap[wordHash] = lines[wordSlice]
    
    if wordHash in wordCountsMap:
      wordCountsMap[wordHash] += 1
    elif len(wordCountsMap) < counters:
      wordCountsMap[wordHash] = 1
    else:
      var keysToDelete = newSeq[Hash]()
      for key, value in wordCountsMap:
        if value == 1:
          keysToDelete.add(key)
        else:
          wordCountsMap[key] -= 1
      for key in keysToDelete:
        wordCountsMap.del(key)
  
  result = newTable[string, int]()
  for key, value in wordCountsMap:
    result[hashToWordMap[key]] = value

proc printTop(table: TableRef[string, int], top: int): void =
  var sorted = newCountTable[string]()
  for key in table.keys:
    sorted[key] = table[key]
  sorted.sort
  
  var n = 0
  for pair in sorted.pairs:
    inc n
    if n > top: break
    let (key, value) = pair
    echo $n, ": '", key, "' -> ", value

let t0 = cpuTime()
let res = counter("enwik8", WORD_SIZE, K)
echo "CPU time [s] ", cpuTime() - t0

res.printTop(30)

Mirror of forum.nim-lang.org

3062 :: String slice performance!