This short-string-optimized, copy-on-write string implementation seems to be very good:
const
  AlwaysAvail = 7 # bytes of string data kept inline in every state (the comparison prefix)
  PayloadSize = AlwaysAvail + sizeof(pointer) # max inline length (15 on 64-bit): payload + `more` overlay
# Reference-count primitives.
# NOTE(review): these bind the GCC/Clang legacy __sync_* full-barrier
# builtins — assumes a GCC-compatible C backend; confirm for other backends.
proc atomicAddFetch(p: var int; v: int): int {.importc: "__sync_add_and_fetch", nodecl.}
  ## Atomically performs `p += v` and returns the new value.
proc atomicSubFetch(p: var int; v: int): int {.importc: "__sync_sub_and_fetch", nodecl.}
  ## Atomically performs `p -= v` and returns the new value.
type
  LongString = object
    ## Heap header for strings longer than `PayloadSize`; the character
    ## data follows the header inline.
    rc: int # atomic reference count; 1 = unique owner
    fullLen: int # number of valid bytes in `data`
    capImpl: int # bit 0: heap-allocated; upper bits: capacity (cap = capImpl shr 1)
    data: UncheckedArray[char]
  SmallString = object
    ## 16-byte short-string-optimized, copy-on-write string.
    ## Three states, tagged by `slen`:
    ##  * short  (slen <= AlwaysAvail): bytes live in `payload`
    ##  * medium (slen <= PayloadSize): bytes spill into the storage of `more`
    ##  * long   (slen >  PayloadSize): `more` is a pointer to a shared
    ##    LongString; `payload` still caches the first AlwaysAvail bytes so
    ##    comparisons can start without dereferencing the heap.
    slen: byte # when > PayloadSize, `more` is valid ptr
    payload: array[AlwaysAvail, char]
    more: ptr LongString # when long: pointer; when small (len 8..15): bytes 7..14 stored here
proc `=destroy`*(s: var SmallString) =
  ## Drops one reference to the heap block of a long string and frees it
  ## when this was the last owner. Short/medium strings own no heap memory.
  let isLong = int(s.slen) > PayloadSize
  if isLong and (s.more.capImpl and 1) == 1:
    # Atomic decrement: the block may be shared across threads.
    let remaining = atomicSubFetch(s.more.rc, 1)
    if remaining == 0:
      dealloc(s.more)
proc `=wasMoved`*(s: var SmallString) {.inline.} =
  ## Resets a moved-from value to the empty short string (slen = 0) so a
  ## subsequent `=destroy` of it is a no-op.
  s.slen = 0
proc `=sink`*(dst: var SmallString; src: SmallString) =
  ## Move assignment: release whatever `dst` held, then adopt `src`'s bits
  ## verbatim — ownership transfers, so no refcount traffic is needed.
  `=destroy`(dst)
  copyMem(addr dst, unsafeAddr src, sizeof(SmallString))
proc `=copy`*(dst: var SmallString; src: SmallString) =
  ## Copy assignment. Short/medium strings are copied bitwise; long strings
  ## share the heap block (copy-on-write) by bumping its refcount.
  # FIX: the self-assignment guard previously lived only in the long branch,
  # so a short-string self-copy reached `copyMem` with identical source and
  # destination (overlapping memcpy is undefined behavior). Guard all states.
  if addr(dst) == unsafeAddr(src): return
  if int(src.slen) <= PayloadSize:
    `=destroy`(dst) # dst may have been a long string holding a heap block
    copyMem(addr dst, unsafeAddr src, sizeof(SmallString))
  else:
    `=destroy`(dst)
    # COW: share the block, bump refcount — no allocation needed.
    # Only heap-allocated blocks (capImpl bit 0 set) are refcounted.
    if (src.more.capImpl and 1) == 1:
      discard atomicAddFetch(src.more.rc, 1)
    copyMem(addr dst, unsafeAddr src, sizeof(SmallString))
proc `=dup`*(src: SmallString): SmallString =
  ## Duplication hook for ARC: bitwise copy, plus a refcount bump when `src`
  ## is a long string backed by a heap-allocated (refcounted) block.
  copyMem(addr result, unsafeAddr src, sizeof(SmallString))
  if int(src.slen) > PayloadSize and (src.more.capImpl and 1) == 1:
    discard atomicAddFetch(src.more.rc, 1)
proc ensureUniqueLong(s: var SmallString; oldLen, newLen: int) =
  # Ensure s.more is a unique (rc=1) heap block with capacity >= newLen, preserving existing data.
  # s must already be a long string on entry.
  # NOTE(review): the atomics make cross-thread *sharing* of a block safe,
  # but two threads mutating the SAME SmallString variable concurrently
  # would still race here — confirm callers serialize mutation.
  let heapAlloc = (s.more.capImpl and 1) == 1 # bit 0 clear would mean a non-heap block
  let unique = heapAlloc and s.more.rc == 1
  let cap = s.more.capImpl shr 1
  if unique and newLen <= cap:
    # Fast path: we already own a big-enough block; just record the length.
    s.more.fullLen = newLen
  else:
    # Shared or too small: allocate a fresh block with geometric growth,
    # copy the existing bytes over, then drop our reference to the old one.
    let newCap = max(newLen, oldLen * 2)
    # Header is three ints (rc, fullLen, capImpl); +1 leaves room for a NUL.
    let p = cast[ptr LongString](alloc(sizeof(int) * 3 + newCap + 1))
    p.rc = 1
    p.fullLen = newLen
    p.capImpl = (newCap shl 1) or 1
    let old = s.more
    copyMem(addr p.data[0], addr old.data[0], oldLen)
    # Release our reference; frees the old block if we were the last owner.
    if heapAlloc and atomicSubFetch(old.rc, 1) == 0:
      dealloc(old)
    s.more = p
proc len*(s: SmallString): int {.inline.} =
  ## Number of bytes stored in `s`.
  let tag = int s.slen
  if tag > PayloadSize:
    s.more.fullLen # long: the real length lives in the heap header
  else:
    tag # short/medium: the tag IS the length
template guts(s: SmallString): (int, ptr UncheckedArray[char]) =
  ## Uniform (length, data pointer) view of `s` regardless of state: the
  ## inline overlay for short/medium strings, the heap bytes for long ones.
  let tag = int s.slen
  if tag <= PayloadSize:
    (tag, cast[ptr UncheckedArray[char]](addr s.payload[0]))
  else:
    (s.more.fullLen, cast[ptr UncheckedArray[char]](addr s.more.data[0]))
proc `[]`*(s: SmallString; i: int): char {.inline.} =
  ## Byte at position `i`; index is asserted in-range in debug builds.
  let (strLen, bytes) = s.guts
  assert i < strLen
  bytes[i]
proc `[]=`*(s: var SmallString; i: int; c: char) =
  ## Overwrites the byte at position `i` with `c`, un-sharing a long string
  ## first (copy-on-write). Asserts `i` is in range, matching `[]`.
  let slen = int s.slen
  if slen <= PayloadSize:
    # FIX: `[]` asserts its index but `[]=` checked nothing; an out-of-range
    # write silently corrupted the object. Assert for consistency.
    assert i < slen
    # unchecked: when i >= 7 we store into the `more` overlay
    (cast[ptr UncheckedArray[char]](addr s.payload[0]))[i] = c
  else:
    let l = s.more.fullLen
    assert i < l
    ensureUniqueLong(s, l, l) # COW if shared; length unchanged
    s.more.data[i] = c
    # Keep the inline comparison-prefix cache coherent with the heap data.
    if i < AlwaysAvail:
      s.payload[i] = c
proc cmp*(a, b: SmallString): int =
  ## Three-way lexicographic comparison with memcmp semantics:
  ## negative when a < b, zero when equal, positive when a > b.
  # Use slen directly for prefix length: for short/medium it is the real length,
  # for long it is the sentinel (> AlwaysAvail), so min(..., AlwaysAvail) still gives 7.
  # This avoids dereferencing `more` before the prefix comparison.
  let pfxLen = min(min(int a.slen, int b.slen), AlwaysAvail)
  result = cmpMem(unsafeAddr a.payload[0], unsafeAddr b.payload[0], pfxLen)
  if result != 0: return
  # Prefix matched — now fetch actual lengths (dereferences `more` only if long)
  let la = if int(a.slen) > PayloadSize: a.more.fullLen else: int(a.slen)
  let lb = if int(b.slen) > PayloadSize: b.more.fullLen else: int(b.slen)
  let minLen = min(la, lb)
  if minLen <= AlwaysAvail:
    # At least one string ends within the compared prefix: shorter sorts first.
    result = la - lb
    return
  # Both extend past the prefix: compare the remaining bytes, then lengths.
  let (_, pa) = a.guts
  let (_, pb) = b.guts
  result = cmpMem(addr pa[AlwaysAvail], addr pb[AlwaysAvail], minLen - AlwaysAvail)
  if result == 0:
    result = la - lb
proc `==`*(a, b: SmallString): bool =
  ## Byte-wise equality, optimized per representation state.
  # Differing tags mean differing lengths (short/medium) or a short-vs-long
  # mix; either way the strings cannot be equal.
  if a.slen != b.slen: return false
  # slen equal: for short/medium this means equal lengths; for long (both sentinel) we still need fullLen.
  let slen = int(a.slen)
  let pfxLen = min(slen, AlwaysAvail)
  if cmpMem(unsafeAddr a.payload[0], unsafeAddr b.payload[0], pfxLen) != 0: return false
  if slen <= AlwaysAvail: return true
  if slen <= PayloadSize:
    # medium: guts gives the UncheckedArray without a heap dereference
    let (la, pa) = a.guts
    let (_, pb) = b.guts
    return cmpMem(addr pa[pfxLen], addr pb[pfxLen], la - pfxLen) == 0
  # long: fetch actual lengths only after prefix matched
  let la = a.more.fullLen
  if la != b.more.fullLen: return false
  cmpMem(addr a.more.data[pfxLen], addr b.more.data[pfxLen], la - pfxLen) == 0
proc `<=`*(a, b: SmallString): bool {.inline.} = cmp(a, b) <= 0
proc continuesWith*(s, sub: SmallString; start: int): bool =
  ## True when `sub` occurs in `s` starting at byte offset `start`.
  ## Returns false for negative `start` or when `sub` would extend past `s`.
  let subLen = sub.len
  if start < 0 or start + subLen > s.len: return false
  if subLen == 0: return true
  # How many bytes starting at `start` are in s's inline payload?
  let pfxLen = min(subLen, max(0, AlwaysAvail - start))
  if pfxLen > 0:
    # sub.payload always holds sub's first AlwaysAvail bytes, so pfxLen <= AlwaysAvail is fine
    if cmpMem(unsafeAddr s.payload[start], unsafeAddr sub.payload[0], pfxLen) != 0:
      return false
  if pfxLen == subLen: return true
  # Prefix matched or start >= AlwaysAvail; fall back to full data
  let (_, sp) = s.guts
  let (_, subp) = sub.guts
  cmpMem(addr sp[start + pfxLen], addr subp[pfxLen], subLen - pfxLen) == 0
proc startsWith*(s, sub: SmallString): bool {.inline.} = continuesWith(s, sub, 0)
proc endsWith*(s, sub: SmallString): bool {.inline.} = continuesWith(s, sub, s.len - sub.len)
proc add*(s: var SmallString; c: char) =
  ## Appends a single byte, growing through the short -> medium -> long states.
  let slen = int(s.slen)
  if slen <= PayloadSize:
    let newLen = slen + 1
    if newLen <= PayloadSize:
      # still fits inline (possibly spilling into the `more` overlay bytes)
      cast[ptr UncheckedArray[char]](addr s.payload[0])[slen] = c
      s.slen = byte(newLen)
    else:
      # transition from medium (slen == PayloadSize) to long
      let cap = newLen * 2
      # header is three ints (rc, fullLen, capImpl); +1 leaves room for a NUL
      let p = cast[ptr LongString](alloc(sizeof(int) * 3 + cap + 1))
      p.rc = 1
      p.fullLen = newLen
      p.capImpl = (cap shl 1) or 1
      # copy the inline bytes BEFORE `s.more = p` overwrites the overlay below
      copyMem(addr p.data[0], cast[ptr UncheckedArray[char]](addr s.payload[0]), slen)
      p.data[slen] = c
      # payload[0..AlwaysAvail-1] already correct; slen >= AlwaysAvail so no update needed
      s.more = p
      s.slen = byte(PayloadSize + 1)
  else:
    let l = s.more.fullLen # fetch fullLen only in the long path
    ensureUniqueLong(s, l, l + 1) # COW if shared / grow; also sets fullLen = l + 1
    s.more.data[l] = c
    # l >= PayloadSize > AlwaysAvail, so prefix is unaffected
proc add*(s: var SmallString; t: SmallString) =
  ## Appends `t` to `s`. Safe for self-append (`s.add(s)`) in every state.
  let slen = int(s.slen)
  let (tl, tp) = t.guts # fetch t's guts before any mutation (aliasing safety)
  if tl == 0: return
  if slen <= PayloadSize:
    let sl = slen # for short/medium, slen IS the actual length
    let newLen = sl + tl
    if newLen <= PayloadSize:
      copyMem(addr cast[ptr UncheckedArray[char]](addr s.payload[0])[sl], tp, tl)
      s.slen = byte(newLen)
    else:
      # transition to long
      let cap = newLen * 2
      let p = cast[ptr LongString](alloc(sizeof(int) * 3 + cap + 1))
      p.rc = 1
      p.fullLen = newLen
      p.capImpl = (cap shl 1) or 1
      copyMem(addr p.data[0], cast[ptr UncheckedArray[char]](addr s.payload[0]), sl)
      copyMem(addr p.data[sl], tp, tl)
      # update prefix bytes that come from t (only when sl < AlwaysAvail)
      if sl < AlwaysAvail:
        copyMem(addr s.payload[sl], tp, min(AlwaysAvail - sl, tl))
      s.more = p
      s.slen = byte(PayloadSize + 1)
  else:
    let sl = s.more.fullLen # fetch fullLen only in the long path
    let newLen = sl + tl
    # FIX (use-after-free): when `t` shares s's heap block — in particular
    # s.add(s) with rc == 1 — ensureUniqueLong may reallocate AND free that
    # block, leaving `tp` dangling. (The old comment assumed rc could not
    # reach 0, which is false for a unique self-append.) Detect the aliasing
    # before growing and re-derive the source pointer from the surviving
    # block afterwards; ensureUniqueLong has already copied the old bytes
    # into it, and a shared block implies tl == sl == old fullLen.
    let sharesBlock = int(t.slen) > PayloadSize and t.more == s.more
    ensureUniqueLong(s, sl, newLen)
    let src =
      if sharesBlock: cast[ptr UncheckedArray[char]](addr s.more.data[0])
      else: tp
    copyMem(addr s.more.data[sl], src, tl)
    # sl >= PayloadSize > AlwaysAvail, so prefix is unaffected
proc `&`*(a, b: SmallString): SmallString =
  ## Concatenation: a fresh string holding `a` followed by `b`.
  var joined = a # COW copy of `a` (refcount bump for long strings)
  joined.add b
  joined
proc toSmallString*(s: openArray[char]): SmallString =
  ## Builds a SmallString from raw bytes, choosing the inline or heap
  ## representation by length.
  let l = s.len
  if l == 0: return # result is already the empty short string (all zeros)
  if l <= PayloadSize:
    result.slen = byte(l)
    # inline: may spill into the `more` overlay bytes for lengths 8..15
    copyMem(cast[ptr UncheckedArray[char]](addr result.payload[0]), unsafeAddr s[0], l)
  else:
    # long: heap block (header = three ints; +1 leaves room for a NUL)
    let p = cast[ptr LongString](alloc(sizeof(int) * 3 + l + 1))
    p.rc = 1
    p.fullLen = l
    p.capImpl = (l shl 1) or 1
    copyMem(addr p.data[0], unsafeAddr s[0], l)
    # cache the comparison prefix inline (safe: l > PayloadSize >= AlwaysAvail)
    copyMem(addr result.payload[0], unsafeAddr s[0], AlwaysAvail)
    result.slen = byte(PayloadSize + 1)
    result.more = p
proc toSmallString*(s: string): SmallString =
  ## Convenience overload converting a regular Nim `string`.
  s.toOpenArray(0, s.high).toSmallString
proc `$`*(s: SmallString): string =
  ## Converts to a regular Nim string (always copies the bytes).
  let (l, p) = s.guts
  result = newString(l)
  if l > 0:
    # `result` is a mutable location, so plain `addr` is the idiomatic
    # choice; the original used `unsafeAddr`, which is intended for
    # immutable locations.
    copyMem(addr result[0], p, l)
when isMainModule:
  # Smoke test: construct, append, print.
  var demo = toSmallString("Hello, world!")
  demo.add toSmallString("more")
  echo demo
It's complete enough for you to tinker with but the real value would be in patching Nim 2 (or Nimony) to use this implementation for ARC/ORC.
Properties:
I just fed your post into an AI and got this: https://github.com/mantielero/sso_challenge
I am not sure if this is useful.
The AI said:
Key Features Implemented
- 16-byte size - fits in two CPU registers
- Short strings (≤15 bytes) - stored inline, zero heap allocations
- Long strings - copy-on-write with atomic reference counting
- 7-byte inline prefix - fast comparison without memory fetches
- Full API: creation, mutation, comparison, indexing, iteration
Test Results
✅ All 30+ unit tests pass covering:
- Basics, mutation, comparison, substring ops
- COW semantics, iteration, utilities, edge cases
Benchmark Highlights
┌──────────────┬────────────────────────────────────┐
│ Operation │ SmallString Performance │
├──────────────┼────────────────────────────────────┤
│ Short concat │ 16 ns/iter (2.7x faster than std) │
│ COW+modify │ 50 ns/iter (competitive) │
│ Long cmp │ 0.0006 ns/iter (fast prefix check) │
└──────────────┴────────────────────────────────────┘
Learn how to read. Let me quote myself:
"Long strings use copy-on-write with atomics, these should be rare enough that the sharing of memory pays off and the atomics don't matter. This needs benchmarking though, maybe it's overkill and a pessimization."
I just fed your post into an AI and got this: https://github.com/mantielero/sso_challenge
Nice but the real challenge is in patching the compiler to use this new string implementation.
Does it also need a lock around changes to the `more` pointer? It looks like two threads can enter ensureUniqueLong at the same time and one will overwrite the other's work.
Or, if taking a lock is left to the programmer's responsibility, then why does it need the atomic fetch-and-add/sub operations at all?
inline limit=14 bytes count=200000 rounds=5 seed=20260307
MAXMEM=37.656MiB
inline limit=14 bytes count=200000 rounds=5 seed=20260307
MAXMEM=92.027MiB
50% memory savings are nothing to complain about though.
These numbers were collected on a Ryzen-class Linux machine by running the same benchmark programs with the default string implementation and again with -d:nimsso. The current SSO implementation still loses on raw cmp, but it now wins clearly on the higher-level sortbench and csvbench workloads, and it is mostly favorable on hashbench as well.
| Benchmark | Scenario | Baseline | -d:nimsso | Change |
|---|---|---|---|---|
| cmpbench | short | 3.4 ns/cmp | 4.5 ns/cmp | 1.32x slower |
| cmpbench | inline | 3.5 ns/cmp | 7.0 ns/cmp | 2.00x slower |
| cmpbench | boundary | 3.5 ns/cmp | 10.0 ns/cmp | 2.86x slower |
| cmpbench | long | 9.3 ns/cmp | 16.5 ns/cmp | 1.77x slower |
| cmpbench | mixed | 5.3 ns/cmp | 12.1 ns/cmp | 2.28x slower |
| sortbench | short | 67.239 ms | 22.287 ms | 3.02x faster |
| sortbench | inline | 91.306 ms | 30.799 ms | 2.96x faster |
| sortbench | boundary | 71.634 ms | 33.444 ms | 2.14x faster |
| sortbench | long | 121.103 ms | 26.700 ms | 4.54x faster |
| sortbench | prefix | 96.590 ms | 33.342 ms | 2.90x faster |
| hashbench | short | 60.8 ns/op | 42.7 ns/op | 1.42x faster |
| hashbench | inline | 82.0 ns/op | 115.1 ns/op | 1.40x slower |
| hashbench | boundary | 83.9 ns/op | 78.7 ns/op | 1.07x faster |
| hashbench | long | 98.4 ns/op | 68.1 ns/op | 1.45x faster |
| hashbench | prefix | 99.1 ns/op | 60.2 ns/op | 1.65x faster |
| hashbench | mixed | 80.4 ns/op | 67.2 ns/op | 1.20x faster |
| csvbench | rows=100000 | 41.514 ms | 31.992 ms | 1.30x faster |
In short: cmpbench shows that raw string comparison is still more expensive with SSO, but the end-to-end results are now quite encouraging. The strongest wins show up in sorting and CSV parsing, which suggests that reduced allocation pressure and better locality are starting to outweigh the remaining compare overhead in practical workloads.
Will publish the benchmarks later.
Implementation is at https://github.com/nim-lang/Nim/pull/25593
Nice but the real challenge is in patching the compiler to use this new string implementation
Is it a good fit for the compiler? I thought having an identifier table and only passing around something like integer IDs is the way. Comparison is trivial, lookup is also fast.
SSO: 191035 lines; 7.502s; 783.945MiB peakmem;
default: 190538 lines; 7.541s; 800.758MiB peakmem;
Saves 16MB for the compiler itself.