A friend was very excited about c++ string_view, which is now available since c++17, and they shared this demo with me where compiling without string_view will yield a runtime of several seconds, and with string_view the runtime is 30-60ms. We often chat about nim so I offered to code up the nim equivalent. I was hoping to use {.experimental: "views".} but couldn't see how to do it, so instead made my own string_view type. To me, this means the answer for doing it in nim is "you have to roll your own." But maybe you can find a way to use views to do this? And I'll learn something :p
Below is the original c++ version followed by my nim version with a shoddy string_view implementation. The demo is just a text corpus to CountTable looking for particular sentences. The demo originally used the Project Gutenberg Encyclopedia, but any suitable corpus should work.
For c++ and nim demos below, compile with or without string_view defined to toggle support.
Caveat: Obviously, you could rewrite the algorithm, but the goal here is to make as few changes as possible and retain the original reading of the algorithm while achieving speedup using views.
#include<iostream>
#include<fstream>
#include<vector>
#include<string>
#include<string_view>
#include<map>
#include<chrono>
using namespace std; // sorry, I find this more readable
int main() {
string line,Book;
ifstream f("pg200.txt");
while(getline(f,line))
Book += line + " ";
f.close();
#ifdef stringview
auto book = string_view(Book);
vector<string_view> sentences;
map<string_view,long> counts;
#else
auto book = Book;
vector<string> sentences;
map<string,long> counts;
#endif
auto start = chrono::system_clock::now();
auto i = book.find("The ");
while(i!=string::npos) {
auto p = book.find_first_of('.',i);
sentences.push_back(book.substr(i,p-i+1));
book = book.substr(p);
i = book.find("The ");
}
for (auto &se : sentences) {
auto s = se;
auto i = s.find(" ");
while(i!=string::npos) {
counts[s.substr(0,i)]++;
s = s.substr(i+1);
i = s.find(" ");
}
}
auto end = chrono::system_clock::now();
cout << "processing took " << chrono::duration_cast<chrono::milliseconds>(end-start).count() << " ms" << endl;
}
import times
import tables
import strutils
import hashes
## my good-enough incomplete string_view
type
string_view = object
src: ptr string
a,b: int
hsh: Hash
proc initString_view(s:var string):string_view =
result.src = addr s
result.a = 0
result.b = s.len - 1
result.hsh = hash(s)
proc hash(sv:string_view):Hash =
result = sv.hsh !& hash(sv.a) !& hash(sv.b)
result = !$ result
template `$`(sv:string_view):untyped =
sv.src[][sv.a..sv.b]
template substr(sv:string_view, start,last:int):untyped =
string_view(src:sv.src,a:sv.a+start,b:sv.a+last)
template substr(sv:string_view, start:int):untyped =
string_view(src:sv.src,a:sv.a+start,b:sv.b)
template find(sv:string_view, sub:char|string, start:Natural=0):untyped =
sv.src[].find(sub, start+sv.a, sv.b)-sv.a
#template find(sv:string_view, sub:char|string, start:Natural, last:Natural):untyped =
# sv.src[].find(sub, start+sv.a, last+sv.a)-sv.a
proc main() =
var bookBuilder = ""
for line in lines "pg200.txt":
bookBuilder.add line & " "
when defined(stringview):
var sentences = newSeq[string_view]()
var counts = newCountTable[string_view]()
var book = initString_view(bookBuilder)
else:
var sentences = newSeq[string]()
var counts = newCountTable[string]()
var book = bookBuilder
let start = cpuTime()
var i = book.find("The ")
while i >= 0:
var p = book.find('.', start=i)
sentences.add book.substr(i,p)
book = book.substr(p)
i = book.find("The ")
for s in sentences:
var frag = s
var i = frag.find(" ")
while i >= 0:
counts.inc(frag.substr(0,i))
frag = frag.substr(i+1)
i = frag.find(" ")
let stop = cpuTime()
echo "processing took ", (stop-start)*1_000, " ms"
when isMainModule: main()
I'm not focusing on performance, but out of curiosity I've adapted your code to use Nim's own views.
Sadly they're still not ready for prime-time yet, as we can see:
import std/[times, tables]
{.experimental:"strictFuncs".}
{.experimental:"views".}
proc substr(sv: openArray[char]; start, last: int): openArray[char] =
sv.toOpenArray(start, last)
proc substr(sv: openArray[char]; start: int): openArray[char] =
sv.toOpenArray(start, sv.high - start)
# butchered versions of `find` from the standard library to work with `openArray`
func find*(s: openArray[char], sub: char, start: Natural = 0, last = 0): int =
let last = if last == 0: s.high else: last
for i in int(start)..last:
if sub == s[i]: return i
return -1
func c_strstr(haystack, needle: cstring): cstring {.importc: "strstr", header: "<string.h>".}
func find(s, sub: openArray[char]; start: Natural = 0): int =
if sub.len > s.len - start: return -1
if sub.len == 1: return find(s, sub[0], start)
let found = c_strstr(s[start].unsafeAddr, sub[0].unsafeAddr)
if not found.isNil:
cast[ByteAddress](found) -% cast[ByteAddress](s[0].unsafeAddr.cstring)
else:
-1
proc main() =
var bookBuilder = ""
for line in lines "pg200.txt":
bookBuilder.add line & " "
var sentences = newSeq[openArray[char]]()
var counts = newCountTable[openArray[char]]()
var book = bookBuilder.toOpenArray(0, bookBuilder.len-1)
let start = cpuTime()
var i = book.find("The ")
while i >= 0:
var p = book.find('.', start=i)
sentences.add book.substr(i,p)
book = book.substr(p)
i = book.find("The ")
for s in sentences:
var frag = s
var i = frag.find(" ")
while i >= 0:
counts.inc(frag.substr(0,i))
frag = frag.substr(i+1)
i = frag.find(" ")
let stop = cpuTime()
echo "processing took ", (stop-start)*1_000, " ms"
let tmp = bookBuilder # fix the lifetime for `bookBuilder`
when isMainModule: main()
Ah well, I'm really looking forward to when views become stable anyways, they're one of my most anticipated features of the language :D