This short-string-optimized, copy-on-write string implementation seems to be very good:
const
  AlwaysAvail = 7 # bytes of string data kept inline in every state (the comparison prefix)
  PayloadSize = AlwaysAvail + sizeof(pointer) # max inline length (15 on 64-bit): payload + `more` overlay
# Reference-count primitives.
# NOTE(review): these bind the GCC/Clang legacy __sync_* full-barrier
# builtins — assumes a GCC-compatible C backend; confirm for other backends.
proc atomicAddFetch(p: var int; v: int): int {.importc: "__sync_add_and_fetch", nodecl.}
  ## Atomically performs `p += v` and returns the new value.
proc atomicSubFetch(p: var int; v: int): int {.importc: "__sync_sub_and_fetch", nodecl.}
  ## Atomically performs `p -= v` and returns the new value.
type
  LongString = object
    ## Heap header for strings longer than `PayloadSize`; the character
    ## data follows the header inline.
    rc: int # atomic reference count; 1 = unique owner
    fullLen: int # number of valid bytes in `data`
    capImpl: int # bit 0: heap-allocated; upper bits: capacity (cap = capImpl shr 1)
    data: UncheckedArray[char]
  SmallString = object
    ## 16-byte short-string-optimized, copy-on-write string.
    ## Three states, tagged by `slen`:
    ##  * short  (slen <= AlwaysAvail): bytes live in `payload`
    ##  * medium (slen <= PayloadSize): bytes spill into the storage of `more`
    ##  * long   (slen >  PayloadSize): `more` is a pointer to a shared
    ##    LongString; `payload` still caches the first AlwaysAvail bytes so
    ##    comparisons can start without dereferencing the heap.
    slen: byte # when > PayloadSize, `more` is valid ptr
    payload: array[AlwaysAvail, char]
    more: ptr LongString # when long: pointer; when small (len 8..15): bytes 7..14 stored here
proc `=destroy`*(s: var SmallString) =
  ## Drops one reference to the heap block of a long string and frees it
  ## when this was the last owner. Short/medium strings own no heap memory.
  let isLong = int(s.slen) > PayloadSize
  if isLong and (s.more.capImpl and 1) == 1:
    # Atomic decrement: the block may be shared across threads.
    let remaining = atomicSubFetch(s.more.rc, 1)
    if remaining == 0:
      dealloc(s.more)
proc `=wasMoved`*(s: var SmallString) {.inline.} =
  ## Resets a moved-from value to the empty short string (slen = 0) so a
  ## subsequent `=destroy` of it is a no-op.
  s.slen = 0
proc `=sink`*(dst: var SmallString; src: SmallString) =
  ## Move assignment: release whatever `dst` held, then adopt `src`'s bits
  ## verbatim — ownership transfers, so no refcount traffic is needed.
  `=destroy`(dst)
  copyMem(addr dst, unsafeAddr src, sizeof(SmallString))
proc `=copy`*(dst: var SmallString; src: SmallString) =
  ## Copy assignment. Short/medium strings are copied bitwise; long strings
  ## share the heap block (copy-on-write) by bumping its refcount.
  # FIX: the self-assignment guard previously lived only in the long branch,
  # so a short-string self-copy reached `copyMem` with identical source and
  # destination (overlapping memcpy is undefined behavior). Guard all states.
  if addr(dst) == unsafeAddr(src): return
  if int(src.slen) <= PayloadSize:
    `=destroy`(dst) # dst may have been a long string holding a heap block
    copyMem(addr dst, unsafeAddr src, sizeof(SmallString))
  else:
    `=destroy`(dst)
    # COW: share the block, bump refcount — no allocation needed.
    # Only heap-allocated blocks (capImpl bit 0 set) are refcounted.
    if (src.more.capImpl and 1) == 1:
      discard atomicAddFetch(src.more.rc, 1)
    copyMem(addr dst, unsafeAddr src, sizeof(SmallString))
proc `=dup`*(src: SmallString): SmallString =
  ## Duplication hook for ARC: bitwise copy, plus a refcount bump when `src`
  ## is a long string backed by a heap-allocated (refcounted) block.
  copyMem(addr result, unsafeAddr src, sizeof(SmallString))
  if int(src.slen) > PayloadSize and (src.more.capImpl and 1) == 1:
    discard atomicAddFetch(src.more.rc, 1)
proc ensureUniqueLong(s: var SmallString; oldLen, newLen: int) =
  # Ensure s.more is a unique (rc=1) heap block with capacity >= newLen, preserving existing data.
  # s must already be a long string on entry.
  # NOTE(review): the atomics make cross-thread *sharing* of a block safe,
  # but two threads mutating the SAME SmallString variable concurrently
  # would still race here — confirm callers serialize mutation.
  let heapAlloc = (s.more.capImpl and 1) == 1 # bit 0 clear would mean a non-heap block
  let unique = heapAlloc and s.more.rc == 1
  let cap = s.more.capImpl shr 1
  if unique and newLen <= cap:
    # Fast path: we already own a big-enough block; just record the length.
    s.more.fullLen = newLen
  else:
    # Shared or too small: allocate a fresh block with geometric growth,
    # copy the existing bytes over, then drop our reference to the old one.
    let newCap = max(newLen, oldLen * 2)
    # Header is three ints (rc, fullLen, capImpl); +1 leaves room for a NUL.
    let p = cast[ptr LongString](alloc(sizeof(int) * 3 + newCap + 1))
    p.rc = 1
    p.fullLen = newLen
    p.capImpl = (newCap shl 1) or 1
    let old = s.more
    copyMem(addr p.data[0], addr old.data[0], oldLen)
    # Release our reference; frees the old block if we were the last owner.
    if heapAlloc and atomicSubFetch(old.rc, 1) == 0:
      dealloc(old)
    s.more = p
proc len*(s: SmallString): int {.inline.} =
  ## Number of bytes stored in `s`.
  let tag = int s.slen
  if tag > PayloadSize:
    s.more.fullLen # long: the real length lives in the heap header
  else:
    tag # short/medium: the tag IS the length
template guts(s: SmallString): (int, ptr UncheckedArray[char]) =
  ## Uniform (length, data pointer) view of `s` regardless of state: the
  ## inline overlay for short/medium strings, the heap bytes for long ones.
  let tag = int s.slen
  if tag <= PayloadSize:
    (tag, cast[ptr UncheckedArray[char]](addr s.payload[0]))
  else:
    (s.more.fullLen, cast[ptr UncheckedArray[char]](addr s.more.data[0]))
proc `[]`*(s: SmallString; i: int): char {.inline.} =
  ## Byte at position `i`; index is asserted in-range in debug builds.
  let (strLen, bytes) = s.guts
  assert i < strLen
  bytes[i]
proc `[]=`*(s: var SmallString; i: int; c: char) =
  ## Overwrites the byte at position `i` with `c`, un-sharing a long string
  ## first (copy-on-write). Asserts `i` is in range, matching `[]`.
  let slen = int s.slen
  if slen <= PayloadSize:
    # FIX: `[]` asserts its index but `[]=` checked nothing; an out-of-range
    # write silently corrupted the object. Assert for consistency.
    assert i < slen
    # unchecked: when i >= 7 we store into the `more` overlay
    (cast[ptr UncheckedArray[char]](addr s.payload[0]))[i] = c
  else:
    let l = s.more.fullLen
    assert i < l
    ensureUniqueLong(s, l, l) # COW if shared; length unchanged
    s.more.data[i] = c
    # Keep the inline comparison-prefix cache coherent with the heap data.
    if i < AlwaysAvail:
      s.payload[i] = c
proc cmp*(a, b: SmallString): int =
  ## Three-way lexicographic comparison with memcmp semantics:
  ## negative when a < b, zero when equal, positive when a > b.
  # Use slen directly for prefix length: for short/medium it is the real length,
  # for long it is the sentinel (> AlwaysAvail), so min(..., AlwaysAvail) still gives 7.
  # This avoids dereferencing `more` before the prefix comparison.
  let pfxLen = min(min(int a.slen, int b.slen), AlwaysAvail)
  result = cmpMem(unsafeAddr a.payload[0], unsafeAddr b.payload[0], pfxLen)
  if result != 0: return
  # Prefix matched — now fetch actual lengths (dereferences `more` only if long)
  let la = if int(a.slen) > PayloadSize: a.more.fullLen else: int(a.slen)
  let lb = if int(b.slen) > PayloadSize: b.more.fullLen else: int(b.slen)
  let minLen = min(la, lb)
  if minLen <= AlwaysAvail:
    # At least one string ends within the compared prefix: shorter sorts first.
    result = la - lb
    return
  # Both extend past the prefix: compare the remaining bytes, then lengths.
  let (_, pa) = a.guts
  let (_, pb) = b.guts
  result = cmpMem(addr pa[AlwaysAvail], addr pb[AlwaysAvail], minLen - AlwaysAvail)
  if result == 0:
    result = la - lb
proc `==`*(a, b: SmallString): bool =
  ## Byte-wise equality, optimized per representation state.
  # Differing tags mean differing lengths (short/medium) or a short-vs-long
  # mix; either way the strings cannot be equal.
  if a.slen != b.slen: return false
  # slen equal: for short/medium this means equal lengths; for long (both sentinel) we still need fullLen.
  let slen = int(a.slen)
  let pfxLen = min(slen, AlwaysAvail)
  if cmpMem(unsafeAddr a.payload[0], unsafeAddr b.payload[0], pfxLen) != 0: return false
  if slen <= AlwaysAvail: return true
  if slen <= PayloadSize:
    # medium: guts gives the UncheckedArray without a heap dereference
    let (la, pa) = a.guts
    let (_, pb) = b.guts
    return cmpMem(addr pa[pfxLen], addr pb[pfxLen], la - pfxLen) == 0
  # long: fetch actual lengths only after prefix matched
  let la = a.more.fullLen
  if la != b.more.fullLen: return false
  cmpMem(addr a.more.data[pfxLen], addr b.more.data[pfxLen], la - pfxLen) == 0
proc `<=`*(a, b: SmallString): bool {.inline.} = cmp(a, b) <= 0
proc continuesWith*(s, sub: SmallString; start: int): bool =
  ## True when `sub` occurs in `s` starting at byte offset `start`.
  ## Returns false for negative `start` or when `sub` would extend past `s`.
  let subLen = sub.len
  if start < 0 or start + subLen > s.len: return false
  if subLen == 0: return true
  # How many bytes starting at `start` are in s's inline payload?
  let pfxLen = min(subLen, max(0, AlwaysAvail - start))
  if pfxLen > 0:
    # sub.payload always holds sub's first AlwaysAvail bytes, so pfxLen <= AlwaysAvail is fine
    if cmpMem(unsafeAddr s.payload[start], unsafeAddr sub.payload[0], pfxLen) != 0:
      return false
  if pfxLen == subLen: return true
  # Prefix matched or start >= AlwaysAvail; fall back to full data
  let (_, sp) = s.guts
  let (_, subp) = sub.guts
  cmpMem(addr sp[start + pfxLen], addr subp[pfxLen], subLen - pfxLen) == 0
proc startsWith*(s, sub: SmallString): bool {.inline.} = continuesWith(s, sub, 0)
proc endsWith*(s, sub: SmallString): bool {.inline.} = continuesWith(s, sub, s.len - sub.len)
proc add*(s: var SmallString; c: char) =
  ## Appends a single byte, growing through the short -> medium -> long states.
  let slen = int(s.slen)
  if slen <= PayloadSize:
    let newLen = slen + 1
    if newLen <= PayloadSize:
      # still fits inline (possibly spilling into the `more` overlay bytes)
      cast[ptr UncheckedArray[char]](addr s.payload[0])[slen] = c
      s.slen = byte(newLen)
    else:
      # transition from medium (slen == PayloadSize) to long
      let cap = newLen * 2
      # header is three ints (rc, fullLen, capImpl); +1 leaves room for a NUL
      let p = cast[ptr LongString](alloc(sizeof(int) * 3 + cap + 1))
      p.rc = 1
      p.fullLen = newLen
      p.capImpl = (cap shl 1) or 1
      # copy the inline bytes BEFORE `s.more = p` overwrites the overlay below
      copyMem(addr p.data[0], cast[ptr UncheckedArray[char]](addr s.payload[0]), slen)
      p.data[slen] = c
      # payload[0..AlwaysAvail-1] already correct; slen >= AlwaysAvail so no update needed
      s.more = p
      s.slen = byte(PayloadSize + 1)
  else:
    let l = s.more.fullLen # fetch fullLen only in the long path
    ensureUniqueLong(s, l, l + 1) # COW if shared / grow; also sets fullLen = l + 1
    s.more.data[l] = c
    # l >= PayloadSize > AlwaysAvail, so prefix is unaffected
proc add*(s: var SmallString; t: SmallString) =
  ## Appends `t` to `s`. Safe for self-append (`s.add(s)`) in every state.
  let slen = int(s.slen)
  let (tl, tp) = t.guts # fetch t's guts before any mutation (aliasing safety)
  if tl == 0: return
  if slen <= PayloadSize:
    let sl = slen # for short/medium, slen IS the actual length
    let newLen = sl + tl
    if newLen <= PayloadSize:
      copyMem(addr cast[ptr UncheckedArray[char]](addr s.payload[0])[sl], tp, tl)
      s.slen = byte(newLen)
    else:
      # transition to long
      let cap = newLen * 2
      let p = cast[ptr LongString](alloc(sizeof(int) * 3 + cap + 1))
      p.rc = 1
      p.fullLen = newLen
      p.capImpl = (cap shl 1) or 1
      copyMem(addr p.data[0], cast[ptr UncheckedArray[char]](addr s.payload[0]), sl)
      copyMem(addr p.data[sl], tp, tl)
      # update prefix bytes that come from t (only when sl < AlwaysAvail)
      if sl < AlwaysAvail:
        copyMem(addr s.payload[sl], tp, min(AlwaysAvail - sl, tl))
      s.more = p
      s.slen = byte(PayloadSize + 1)
  else:
    let sl = s.more.fullLen # fetch fullLen only in the long path
    let newLen = sl + tl
    # FIX (use-after-free): when `t` shares s's heap block — in particular
    # s.add(s) with rc == 1 — ensureUniqueLong may reallocate AND free that
    # block, leaving `tp` dangling. (The old comment assumed rc could not
    # reach 0, which is false for a unique self-append.) Detect the aliasing
    # before growing and re-derive the source pointer from the surviving
    # block afterwards; ensureUniqueLong has already copied the old bytes
    # into it, and a shared block implies tl == sl == old fullLen.
    let sharesBlock = int(t.slen) > PayloadSize and t.more == s.more
    ensureUniqueLong(s, sl, newLen)
    let src =
      if sharesBlock: cast[ptr UncheckedArray[char]](addr s.more.data[0])
      else: tp
    copyMem(addr s.more.data[sl], src, tl)
    # sl >= PayloadSize > AlwaysAvail, so prefix is unaffected
proc `&`*(a, b: SmallString): SmallString =
  ## Concatenation: a fresh string holding `a` followed by `b`.
  var joined = a # COW copy of `a` (refcount bump for long strings)
  joined.add b
  joined
proc toSmallString*(s: openArray[char]): SmallString =
  ## Builds a SmallString from raw bytes, choosing the inline or heap
  ## representation by length.
  let l = s.len
  if l == 0: return # result is already the empty short string (all zeros)
  if l <= PayloadSize:
    result.slen = byte(l)
    # inline: may spill into the `more` overlay bytes for lengths 8..15
    copyMem(cast[ptr UncheckedArray[char]](addr result.payload[0]), unsafeAddr s[0], l)
  else:
    # long: heap block (header = three ints; +1 leaves room for a NUL)
    let p = cast[ptr LongString](alloc(sizeof(int) * 3 + l + 1))
    p.rc = 1
    p.fullLen = l
    p.capImpl = (l shl 1) or 1
    copyMem(addr p.data[0], unsafeAddr s[0], l)
    # cache the comparison prefix inline (safe: l > PayloadSize >= AlwaysAvail)
    copyMem(addr result.payload[0], unsafeAddr s[0], AlwaysAvail)
    result.slen = byte(PayloadSize + 1)
    result.more = p
proc toSmallString*(s: string): SmallString =
  ## Convenience overload converting a regular Nim `string`.
  s.toOpenArray(0, s.high).toSmallString
proc `$`*(s: SmallString): string =
  ## Converts to a regular Nim string (always copies the bytes).
  let (l, p) = s.guts
  result = newString(l)
  if l > 0:
    # `result` is a mutable location, so plain `addr` is the idiomatic
    # choice; the original used `unsafeAddr`, which is intended for
    # immutable locations.
    copyMem(addr result[0], p, l)
when isMainModule:
  # Smoke test: construct, append, print.
  var demo = toSmallString("Hello, world!")
  demo.add toSmallString("more")
  echo demo
It's complete enough for you to tinker with but the real value would be in patching Nim 2 (or Nimony) to use this implementation for ARC/ORC.
Properties:
I just fed your post into an AI and got this: https://github.com/mantielero/sso_challenge
I am not sure if this is useful.
The AI said:
Key Features Implemented
- 16-byte size - fits in two CPU registers
- Short strings (≤15 bytes) - stored inline, zero heap allocations
- Long strings - copy-on-write with atomic reference counting
- 7-byte inline prefix - fast comparison without memory fetches
- Full API: creation, mutation, comparison, indexing, iteration
Test Results
✅ All 30+ unit tests pass covering:
- Basics, mutation, comparison, substring ops
- COW semantics, iteration, utilities, edge cases
Benchmark Highlights
┌──────────────┬────────────────────────────────────┐
│ Operation │ SmallString Performance │
├──────────────┼────────────────────────────────────┤
│ Short concat │ 16 ns/iter (2.7x faster than std) │
│ COW+modify │ 50 ns/iter (competitive) │
│ Long cmp │ 0.0006 ns/iter (fast prefix check) │
└──────────────┴────────────────────────────────────┘
Learn how to read. Let me quote myself:
"Long strings use copy-on-write with atomics, these should be rare enough that the sharing of memory pays off and the atomics don't matter. This needs benchmarking though, maybe it's overkill and a pessimization."
I just fed your post into an AI and got this: https://github.com/mantielero/sso_challenge
Nice but the real challenge is in patching the compiler to use this new string implementation.
Does it also need a lock around changes to the `more` pointer? It looks like two threads can enter ensureUniqueLong at the same time and one will overwrite the other's work.
Or, if taking a lock is left to the programmer's responsibility, then why does it need the atomic fetch-and-add/sub operations at all?
inline limit=14 bytes count=200000 rounds=5 seed=20260307
MAXMEM=37.656MiB
inline limit=14 bytes count=200000 rounds=5 seed=20260307
MAXMEM=92.027MiB
50% memory savings are nothing to complain about though.
These numbers were collected on a Ryzen-class Linux machine by running the same benchmark programs with the default string implementation and again with -d:nimsso. The current SSO implementation still loses on raw cmp, but it now wins clearly on the higher-level sortbench and csvbench workloads, and it is mostly favorable on hashbench as well.
| Benchmark | Scenario | Baseline | -d:nimsso | Change |
|---|---|---|---|---|
| cmpbench | short | 3.4 ns/cmp | 4.5 ns/cmp | 1.32x slower |
| cmpbench | inline | 3.5 ns/cmp | 7.0 ns/cmp | 2.00x slower |
| cmpbench | boundary | 3.5 ns/cmp | 10.0 ns/cmp | 2.86x slower |
| cmpbench | long | 9.3 ns/cmp | 16.5 ns/cmp | 1.77x slower |
| cmpbench | mixed | 5.3 ns/cmp | 12.1 ns/cmp | 2.28x slower |
| sortbench | short | 67.239 ms | 22.287 ms | 3.02x faster |
| sortbench | inline | 91.306 ms | 30.799 ms | 2.96x faster |
| sortbench | boundary | 71.634 ms | 33.444 ms | 2.14x faster |
| sortbench | long | 121.103 ms | 26.700 ms | 4.54x faster |
| sortbench | prefix | 96.590 ms | 33.342 ms | 2.90x faster |
| hashbench | short | 60.8 ns/op | 42.7 ns/op | 1.42x faster |
| hashbench | inline | 82.0 ns/op | 115.1 ns/op | 1.40x slower |
| hashbench | boundary | 83.9 ns/op | 78.7 ns/op | 1.07x faster |
| hashbench | long | 98.4 ns/op | 68.1 ns/op | 1.45x faster |
| hashbench | prefix | 99.1 ns/op | 60.2 ns/op | 1.65x faster |
| hashbench | mixed | 80.4 ns/op | 67.2 ns/op | 1.20x faster |
| csvbench | rows=100000 | 41.514 ms | 31.992 ms | 1.30x faster |
In short: cmpbench shows that raw string comparison is still more expensive with SSO, but the end-to-end results are now quite encouraging. The strongest wins show up in sorting and CSV parsing, which suggests that reduced allocation pressure and better locality are starting to outweigh the remaining compare overhead in practical workloads.
Will publish the benchmarks later.
Implementation is at https://github.com/nim-lang/Nim/pull/25593
Nice but the real challenge is in patching the compiler to use this new string implementation
Is it a good fit for the compiler? I thought having an identifier table and only passing around something like integer IDs is the way. Comparison is trivial, lookup is also fast.
SSO: 191035 lines; 7.502s; 783.945MiB peakmem;
default: 190538 lines; 7.541s; 800.758MiB peakmem;
Saves 16MB for the compiler itself.