Hello. I wrote a SipHash implementation. It's quite dirty because I'm very noobish, but that's not the main focus at the moment; the problem is that it's not as fast as I'd like it to be.
Nimrod performance:
Equivalent C++ performance:
C++ with optimised access:
All on Phenom 2
The key seems to be data access. SipHash treats the input as a sequence of little-endian uint64 blocks. In Nimrod I gather them with shifts and ors, and in the regular C++ version I do the same. In the optimised C++ version I explicitly treat the data as a sequence of uint64s, ignoring alignment issues since they don't exist on my CPU. It appears that Clang notices this in my C++ code and always generates direct accesses; gcc doesn't. Is there a way to detect the target platform at compile time and, if it's AMD64 (or one of several others), issue direct accesses? I tried a cast[openarray[uint64]], but the compiler told me:
Error: invalid type: 'openarray[uint64]'
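For illustration, the kind of direct access I mean would look roughly like this in Nimrod (only a sketch; it assumes unaligned loads are safe, which is exactly what I'd like to guard with a platform check):

# sketch: reinterpret 8 bytes of the input as one little-endian uint64
# only safe on CPUs that tolerate unaligned loads (e.g. AMD64)
let word = cast[ptr uint64](addr input[i])[]

My full code: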
import unsigned
import strutils
import times
const rounds = 2
const final_rounds = 4
# only valid for various uints
proc rotl*[T](x: var T, shift: T) =
  const x_bits: T = sizeof(x) * 8
  x = (x shl shift) or (x shr (x_bits - shift))
proc `+=`(x: var uint64, y: uint64) =
  x = x + y

proc `^=`(x: var uint64, y: uint64) =
  x = x xor y
proc mix(v0, v1, v2, v3: var uint64) =
  v0 += v1
  v2 += v3
  rotl(v1, 13'u64)
  rotl(v3, 16'u64)
  v1 ^= v0
  v3 ^= v2
  rotl(v0, 32'u64)
  v2 += v1
  v0 += v3
  rotl(v1, 17'u64)
  rotl(v3, 21'u64)
  v1 ^= v2
  v3 ^= v0
  rotl(v2, 32'u64)
proc hash(input: openarray[uint8], size: int, key0, key1: uint64): uint64 =
  var v0: uint64 = key0 xor 0x736f6d6570736575'u64
  var v1: uint64 = key1 xor 0x646f72616e646f6d'u64
  var v2: uint64 = key0 xor 0x6c7967656e657261'u64
  var v3: uint64 = key1 xor 0x7465646279746573'u64
  var i: int = 0
  # the main loop
  while i + 7 < size:
    let word: uint64 =
      (cast[uint64](input[i+0]) shl 0) or
      (cast[uint64](input[i+1]) shl 8) or
      (cast[uint64](input[i+2]) shl 16) or
      (cast[uint64](input[i+3]) shl 24) or
      (cast[uint64](input[i+4]) shl 32) or
      (cast[uint64](input[i+5]) shl 40) or
      (cast[uint64](input[i+6]) shl 48) or
      (cast[uint64](input[i+7]) shl 56)
    v3 ^= word
    for j in countup(1, rounds):
      mix(v0, v1, v2, v3)
    v0 ^= word
    i += 8
  # gather the last 0..7 bytes
  var tail = cast[uint64](size) shl 56
  let bytes_left = size and 7 # and 7 is the same as mod 8
  if bytes_left > 0:
    tail = tail or cast[uint64](input[i+0])
  if bytes_left > 1:
    tail = tail or (cast[uint64](input[i+1]) shl 8)
  if bytes_left > 2:
    tail = tail or (cast[uint64](input[i+2]) shl 16)
  if bytes_left > 3:
    tail = tail or (cast[uint64](input[i+3]) shl 24)
  if bytes_left > 4:
    tail = tail or (cast[uint64](input[i+4]) shl 32)
  if bytes_left > 5:
    tail = tail or (cast[uint64](input[i+5]) shl 40)
  if bytes_left > 6:
    tail = tail or (cast[uint64](input[i+6]) shl 48)
  # process the last bytes
  v3 ^= tail
  for j in countup(1, rounds):
    mix(v0, v1, v2, v3)
  v0 ^= tail
  v2 ^= 0xff'u64
  # the final mix
  for j in countup(1, final_rounds):
    mix(v0, v1, v2, v3)
  return v0 xor v1 xor v2 xor v3
const chunk_size = 32768
var a: array[0..(chunk_size - 1), uint8]
var b: uint64
var c: uint64
var best_time: float = 0
const small_iters = 10000
const big_iters = 10
for i in countup(1, big_iters):
  var start_time = cpuTime()
  for i in countup(1, small_iters):
    a[0] = cast[uint8](i)
    b = hash(a, chunk_size, 0'u64, 0'u64)
    c = b xor c
  var total_time = cpuTime() - start_time
  if best_time == 0 or best_time > total_time:
    best_time = total_time
var speed: float = (best_time * 3200000000'f64) / (chunk_size * small_iters) # ticks / byte
echo formatFloat(speed, ffDecimal, 6), " ticks / byte"
const test_vectors: array[0..63, uint64] = [
0x726fdb47dd0e0e31'u64,0x74f839c593dc67fd'u64,0x0d6c8009d9a94f5a'u64,0x85676696d7fb7e2d'u64,
0xcf2794e0277187b7'u64,0x18765564cd99a68d'u64,0xcbc9466e58fee3ce'u64,0xab0200f58b01d137'u64,
0x93f5f5799a932462'u64,0x9e0082df0ba9e4b0'u64,0x7a5dbbc594ddb9f3'u64,0xf4b32f46226bada7'u64,
0x751e8fbc860ee5fb'u64,0x14ea5627c0843d90'u64,0xf723ca908e7af2ee'u64,0xa129ca6149be45e5'u64,
0x3f2acc7f57c29bdb'u64,0x699ae9f52cbe4794'u64,0x4bc1b3f0968dd39c'u64,0xbb6dc91da77961bd'u64,
0xbed65cf21aa2ee98'u64,0xd0f2cbb02e3b67c7'u64,0x93536795e3a33e88'u64,0xa80c038ccd5ccec8'u64,
0xb8ad50c6f649af94'u64,0xbce192de8a85b8ea'u64,0x17d835b85bbb15f3'u64,0x2f2e6163076bcfad'u64,
0xde4daaaca71dc9a5'u64,0xa6a2506687956571'u64,0xad87a3535c49ef28'u64,0x32d892fad841c342'u64,
0x7127512f72f27cce'u64,0xa7f32346f95978e3'u64,0x12e0b01abb051238'u64,0x15e034d40fa197ae'u64,
0x314dffbe0815a3b4'u64,0x027990f029623981'u64,0xcadcd4e59ef40c4d'u64,0x9abfd8766a33735c'u64,
0x0e3ea96b5304a7d0'u64,0xad0c42d6fc585992'u64,0x187306c89bc215a9'u64,0xd4a60abcf3792b95'u64,
0xf935451de4f21df2'u64,0xa9538f0419755787'u64,0xdb9acddff56ca510'u64,0xd06c98cd5c0975eb'u64,
0xe612a3cb9ecba951'u64,0xc766e62cfcadaf96'u64,0xee64435a9752fe72'u64,0xa192d576b245165a'u64,
0x0a8787bf8ecb74b2'u64,0x81b3e73d20b49b6f'u64,0x7fa8220ba3b2ecea'u64,0x245731c13ca42499'u64,
0xb78dbfaf3a8d83bd'u64,0xea1ad565322a1a0b'u64,0x60e61c23a3795013'u64,0x6606d7e446282b93'u64,
0x6ca4ecb15c5f91e1'u64,0x9f626da15c9625f3'u64,0xe51b38608ef25f57'u64,0x958a324ceb064572'u64]
var key0: uint64 = 0x0706050403020100'u64
var key1: uint64 = 0x0f0e0d0c0b0a0908'u64
var all_match = true
for i in countup(0, 63):
  a[i] = cast[uint8](i)
  b = hash(a, i, key0, key1)
  if b != test_vectors[i]:
    echo "Test vector ", i, " failed"
    all_match = false
    break
if all_match:
  echo "Self-test succeeded"
# make sure that compiler won't optimise the hashing calls out
# c is not 0, so nothing will be printed
if c == 0:
  echo c
Hi. Sorry, I don't have the time to answer all your questions or proofread your code at the moment, but just to be sure, since I didn't see it in your post: you are compiling for release ($ nimrod c -d:release yourcode.nim), right?
Also, 'openarray' is only meant for parameters, which might be why attempts to cast to it aren't working. I don't know all the details, but you can read more here: http://nimrod-lang.org/manual.html#openarray_1691731766
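For example (a minimal sketch; 'total' is just a made-up proc to show the distinction):

# openarray may only appear as a parameter type;
# arrays and seqs are passed to such parameters implicitly:
proc total(xs: openarray[uint64]): uint64 =
  for x in xs:
    result = result + x

# whereas declaring or casting to it is rejected, as you saw:
# var v: openarray[uint64]            # error
# cast[openarray[uint64]](input)      # error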
var x = cast[array[0..((size / 8) - 1), uint8]](input)
Results in:
siphash.nim(41, 39) Error: constant expression expected
I then tried making the array of a constant but large size, with the intention of using max int or so:
var input64 = cast[array[0..99999, uint64]](input)
But it failed at runtime:
Traceback (most recent call last)
siphash.nim(96)          siphash
siphash.nim(41)          hash
SIGSEGV: Illegal storage access. (Attempt to read from nil?)
Hint: operation successful (10141 lines compiled; 0.707 sec total; 17.174MB) [SuccessX]
Line 41 is the cast, not subsequent usage.
Yes, I'm compiling for release; otherwise it takes 190 cycles/byte. I also tried pragmas, but they did nothing.
var s = newSeq[uint64](size)
var s = @[a, b, c, ...] # shorthand syntax
In JavaScript there's an ArrayBufferView that can provide different data views on the same array, i.e. you can read uint8's or uint32's from the same array:
https://developer.mozilla.org/en-US/docs/Web/API/ArrayBufferView
I think something like that would have helped you in this situation, while keeping the code safe. Maybe it would be possible to create an arrayview module for Nimrod?
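A rough sketch of what such a view could look like in Nimrod (untested, and 'viewU64' is just a made-up name; it also assumes the platform tolerates unaligned loads, which is the very thing being discussed):

# hypothetical helper: reinterpret 8 bytes of a byte buffer as one uint64
proc viewU64(buf: var openarray[uint8], offset: int): uint64 =
  cast[ptr uint64](addr buf[offset])[]

A safe version would fall back to byte-by-byte assembly on platforms that don't allow unaligned access.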
I got to study the C code, and I'm shocked, because I've benchmarked nearly every C(++) implementation of SipHash and none managed to beat mine (which is in fact based on the fastest one out there). What can I say: congratulations and thank you.
However, there's another part of my question. This code is going to crash on most machines because of alignment issues. Is there a way to detect the target platform at runtime?
End of mystery. Time measurements with cpuTime() turned out to be too imprecise. After I modified the C code generated by the Nimrod compiler to use the rdtsc instruction, like my C++ benchmark does, I got 1.5089 ticks/byte, exactly the same as the best C(++) implementations.
A great result, I think.
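(Instead of editing the generated C, one could probably also bind the intrinsic directly from Nimrod; a sketch, assuming compilation through gcc or clang on x86:)

# sketch: bind the compiler's __rdtsc intrinsic (x86 only, gcc/clang)
proc rdtsc(): uint64 {.importc: "__rdtsc", header: "<x86intrin.h>".}

let t0 = rdtsc()
# ... work to measure ...
let cycles = rdtsc() - t0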
FYI: Nimrod default-initializes (zeroes) the memory of vars and of the proc's implicit 'result'. The {.noInit.} pragma prevents that (leaving unset vars as garbage, like in C/C++).
I'm not sure it will help since you're not using 'result' (just the return keyword), but one minor optimization you could try is to put a {.noInit.} pragma on the 'hash' proc. I'm unaware of all the semantic analysis Nimrod does here, so again, may not help at all, but it's worth a try.
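For illustration (hash64 below is a made-up stand-in, not your hash proc):

var scratch {.noInit.}: array[0..7, uint8]   # left uninitialized

proc hash64(x: uint64): uint64 {.noInit.} =  # 'result' is not zeroed first
  result = x * 0x9e3779b97f4a7c15'u64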
The 2 mentioned last are the ones that are likely to benefit from pragmas, and if I get that far (I already have the 2nd task in mind: using macros to create a for-else loop), I will surely revisit all the macros. Thanks anyway.
I did some more comprehensive benchmarks: my Nimrod implementation, 15 in C, 2 in C++, 3 in Java, and 3 in Python. This time I measured performance on both short and long messages. The Nimrod-generated C code won on short messages and came second on long ones, 5% behind an implementation that gcc happens to understand very well (and clang much worse). Against that particular implementation, performance on 12-byte strings was 10% better.
I haven't done any optimizations except for the already mentioned forcing of unaligned reads; even {.noInit.} proved unnecessary.
Call me impressed.
Anyway, I'd like to repeat a question that seems to have been missed: Is there a way to do compile-time platform detection? I'd like to do unaligned reads only when I can make sure it's safe.
BTW, Java turned out to be ca. 3 times slower for long messages and 3000 times slower for short (12 B) ones. Python: 2000 times slower for both short and long messages. I keep looking for errors, but so far I don't see any. I have certainly spent more time optimizing Java and (especially) Python than Nimrod so far.
Mścigniew said: Is there a way to do compile-time platform detection?
You mean like
when defined(Linux):
...
elif defined(Windows):
...
? I'm not sure where to look that up, though I'm sure it's possible, and I'm surprised no one better informed than I am has answered your question yet. But if all else fails, you can always pass your own compiler switches with -d:yourdef. Example:
when defined(ArchX86):
...
elif defined(ArchARM):
...
and compile with nimrod c -d:ArchX86 ..., etc. Sorry I can't be of more help. To get a better answer, try hopping on the IRC and asking there.
I don't understand what you need precisely, but if you use 'nimrod dump <filename>'* on a compilable file, you'll get a printout of the currently defined symbols, as well as the search paths for libraries. On my Win64 machine, I have the 'x8664', 'cpu64', 'littleendian', and 'amd64' symbols defined - perhaps those symbols will help?
Note - the 'dump' command is undocumented, and may go away in the future
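For instance, those symbols could gate the unaligned read at compile time; a sketch (untested, with the symbol names taken from the dump output):

# sketch (untested): choose the read strategy from the defined symbols
var buf: array[0..63, uint8]
let i = 0

when defined(amd64) or defined(i386):
  # x86 tolerates unaligned loads, so reinterpret 8 bytes directly
  let word = cast[ptr uint64](addr buf[i])[]
else:
  # portable fallback: assemble the 64-bit word byte by byte (little endian)
  var word = 0'u64
  for j in 0..7:
    word = word or (cast[uint64](buf[i+j]) shl (8*j))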
Thanks for the hint. I get those too. 'amd64' and 'x8664' look to be what I need, though I wondered why there are two that usually mean the same thing, so I grepped the sources.
I found 3 places that define platform-specific stuff, each different. compiler/platforms.nim looks like the right one:
(name: "i386", intSize: 32, endian: littleEndian, floatSize: 64, bit: 32),
(name: "m68k", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "alpha", intSize: 64, endian: littleEndian, floatSize: 64, bit: 64),
(name: "powerpc", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "powerpc64", intSize: 64, endian: bigEndian, floatSize: 64,bit: 64),
(name: "sparc", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "vm", intSize: 32, endian: littleEndian, floatSize: 64, bit: 32),
(name: "ia64", intSize: 64, endian: littleEndian, floatSize: 64, bit: 64),
(name: "amd64", intSize: 64, endian: littleEndian, floatSize: 64, bit: 64),
(name: "mips", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "arm", intSize: 32, endian: littleEndian, floatSize: 64, bit: 32),
(name: "js", intSize: 32, endian: bigEndian,floatSize: 64,bit: 32),
(name: "nimrodvm", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "avr", intSize: 16, endian: littleEndian, floatSize: 32, bit: 16)]
I wonder what vm and nimrodvm are...
ADDED: And what's the difference between 'bit' and 'intSize'? Last time I checked, avr was 8-bit...