Hello. I wrote a SipHash implementation. It's quite dirty because I'm very noobish, but that's not the main focus at the moment; the problem is that it's not as fast as I'd like it to be.
Nimrod performance:
Equivalent C++ performance:
C++ with optimised access:
All on Phenom 2
The key seems to be data access. SipHash treats the input as a sequence of little-endian uint64 blocks. In Nimrod I gather them with shifts and ors, and in the regular C++ version I do the same. In the optimised C++ version I explicitly treat the data as a sequence of uint64s, ignoring alignment issues since they don't exist on my CPU. It appears that Clang notices this in my C++ code and always generates direct accesses; gcc doesn't. Is there a way to detect the target platform at compile time and, if it's AMD64 (or one of several others), issue direct accesses? I tried a cast[openarray[uint64]], but the compiler told me:
Error: invalid type: 'openarray[uint64]'
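For illustration, the kind of direct access I mean would look roughly like this in Nimrod (only a sketch; it assumes unaligned loads are safe, which is exactly what I'd like to guard with a platform check):

# sketch: reinterpret 8 bytes of the input as one little-endian uint64
# only safe on CPUs that tolerate unaligned loads (e.g. AMD64)
let word = cast[ptr uint64](addr input[i])[]

My full code: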
import unsigned
import strutils
import times
const rounds = 2
const final_rounds = 4
# only valid for various uints
proc rotl*[T](x: var T, shift: T) =
  const x_bits: T = sizeof(x) * 8
  x = (x shl shift) or (x shr (x_bits - shift))
proc `+=`(x: var uint64, y: uint64) =
  x = x + y

proc `^=`(x: var uint64, y: uint64) =
  x = x xor y
proc mix(v0, v1, v2, v3: var uint64) =
  v0 += v1
  v2 += v3
  rotl(v1, 13'u64)
  rotl(v3, 16'u64)
  v1 ^= v0
  v3 ^= v2
  rotl(v0, 32'u64)
  v2 += v1
  v0 += v3
  rotl(v1, 17'u64)
  rotl(v3, 21'u64)
  v1 ^= v2
  v3 ^= v0
  rotl(v2, 32'u64)
proc hash(input: openarray[uint8], size: int, key0, key1: uint64): uint64 =
  var v0: uint64 = key0 xor 0x736f6d6570736575'u64
  var v1: uint64 = key1 xor 0x646f72616e646f6d'u64
  var v2: uint64 = key0 xor 0x6c7967656e657261'u64
  var v3: uint64 = key1 xor 0x7465646279746573'u64
  var i: int = 0
  # the main loop
  while i + 7 < size:
    let word: uint64 =
      (cast[uint64](input[i+0]) shl 0) or
      (cast[uint64](input[i+1]) shl 8) or
      (cast[uint64](input[i+2]) shl 16) or
      (cast[uint64](input[i+3]) shl 24) or
      (cast[uint64](input[i+4]) shl 32) or
      (cast[uint64](input[i+5]) shl 40) or
      (cast[uint64](input[i+6]) shl 48) or
      (cast[uint64](input[i+7]) shl 56)
    v3 ^= word
    for j in countup(1, rounds):
      mix(v0, v1, v2, v3)
    v0 ^= word
    i += 8
  # gather the last 0..7 bytes
  var tail = cast[uint64](size) shl 56
  let bytes_left = size and 7 # and 7 is the same as mod 8
  if bytes_left > 0:
    tail = tail or cast[uint64](input[i+0])
  if bytes_left > 1:
    tail = tail or (cast[uint64](input[i+1]) shl 8)
  if bytes_left > 2:
    tail = tail or (cast[uint64](input[i+2]) shl 16)
  if bytes_left > 3:
    tail = tail or (cast[uint64](input[i+3]) shl 24)
  if bytes_left > 4:
    tail = tail or (cast[uint64](input[i+4]) shl 32)
  if bytes_left > 5:
    tail = tail or (cast[uint64](input[i+5]) shl 40)
  if bytes_left > 6:
    tail = tail or (cast[uint64](input[i+6]) shl 48)
  # process the last bytes
  v3 ^= tail
  for j in countup(1, rounds):
    mix(v0, v1, v2, v3)
  v0 ^= tail
  v2 ^= 0xff'u64
  # the final mix
  for j in countup(1, final_rounds):
    mix(v0, v1, v2, v3)
  return v0 xor v1 xor v2 xor v3
const chunk_size = 32768
var a: array[0..(chunk_size - 1), uint8]
var b: uint64
var c: uint64
var best_time: float = 0
const small_iters = 10000
const big_iters = 10
for i in countup(1, big_iters):
  var start_time = cpuTime()
  for i in countup(1, small_iters):
    a[0] = cast[uint8](i)
    b = hash(a, chunk_size, 0'u64, 0'u64)
    c = b xor c
  var total_time = cpuTime() - start_time
  if best_time == 0 or best_time > total_time:
    best_time = total_time
var speed: float = (best_time * 3200000000'f64) / (chunk_size * small_iters) # ticks / byte
echo formatFloat(speed, ffDecimal, 6), " ticks / byte"
const test_vectors: array[0..63, uint64] = [
0x726fdb47dd0e0e31'u64,0x74f839c593dc67fd'u64,0x0d6c8009d9a94f5a'u64,0x85676696d7fb7e2d'u64,
0xcf2794e0277187b7'u64,0x18765564cd99a68d'u64,0xcbc9466e58fee3ce'u64,0xab0200f58b01d137'u64,
0x93f5f5799a932462'u64,0x9e0082df0ba9e4b0'u64,0x7a5dbbc594ddb9f3'u64,0xf4b32f46226bada7'u64,
0x751e8fbc860ee5fb'u64,0x14ea5627c0843d90'u64,0xf723ca908e7af2ee'u64,0xa129ca6149be45e5'u64,
0x3f2acc7f57c29bdb'u64,0x699ae9f52cbe4794'u64,0x4bc1b3f0968dd39c'u64,0xbb6dc91da77961bd'u64,
0xbed65cf21aa2ee98'u64,0xd0f2cbb02e3b67c7'u64,0x93536795e3a33e88'u64,0xa80c038ccd5ccec8'u64,
0xb8ad50c6f649af94'u64,0xbce192de8a85b8ea'u64,0x17d835b85bbb15f3'u64,0x2f2e6163076bcfad'u64,
0xde4daaaca71dc9a5'u64,0xa6a2506687956571'u64,0xad87a3535c49ef28'u64,0x32d892fad841c342'u64,
0x7127512f72f27cce'u64,0xa7f32346f95978e3'u64,0x12e0b01abb051238'u64,0x15e034d40fa197ae'u64,
0x314dffbe0815a3b4'u64,0x027990f029623981'u64,0xcadcd4e59ef40c4d'u64,0x9abfd8766a33735c'u64,
0x0e3ea96b5304a7d0'u64,0xad0c42d6fc585992'u64,0x187306c89bc215a9'u64,0xd4a60abcf3792b95'u64,
0xf935451de4f21df2'u64,0xa9538f0419755787'u64,0xdb9acddff56ca510'u64,0xd06c98cd5c0975eb'u64,
0xe612a3cb9ecba951'u64,0xc766e62cfcadaf96'u64,0xee64435a9752fe72'u64,0xa192d576b245165a'u64,
0x0a8787bf8ecb74b2'u64,0x81b3e73d20b49b6f'u64,0x7fa8220ba3b2ecea'u64,0x245731c13ca42499'u64,
0xb78dbfaf3a8d83bd'u64,0xea1ad565322a1a0b'u64,0x60e61c23a3795013'u64,0x6606d7e446282b93'u64,
0x6ca4ecb15c5f91e1'u64,0x9f626da15c9625f3'u64,0xe51b38608ef25f57'u64,0x958a324ceb064572'u64]
var key0: uint64 = 0x0706050403020100'u64
var key1: uint64 = 0x0f0e0d0c0b0a0908'u64
var all_match = true
for i in countup(0, 63):
  a[i] = cast[uint8](i)
  b = hash(a, i, key0, key1)
  if b != test_vectors[i]:
    echo "Test vector ", i, " failed"
    all_match = false
    break
if all_match:
  echo "Self-test succeeded"
# make sure that compiler won't optimise the hashing calls out
# c is not 0, so nothing will be printed
if c == 0:
  echo c
Hi. Sorry, I don't have the time to answer all your questions or proofread your code at the moment, but just to be sure, since I didn't see it in your post: you are compiling for release ($ nimrod c -d:release yourcode.nim), right?
Also, 'openarray' is only meant for parameters, which might be why attempts to cast to it aren't working. I don't know all the details, but you can read more here: http://nimrod-lang.org/manual.html#openarray_1691731766
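For example (a minimal sketch; 'total' is just a made-up proc to show the distinction):

# openarray may only appear as a parameter type;
# arrays and seqs are passed to such parameters implicitly:
proc total(xs: openarray[uint64]): uint64 =
  for x in xs:
    result = result + x

# whereas declaring or casting to it is rejected, as you saw:
# var v: openarray[uint64]            # error
# cast[openarray[uint64]](input)      # error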
var x = cast[array[0..((size / 8) - 1), uint8]](input)
Results in:
siphash.nim(41, 39) Error: constant expression expected
I then tried making the array of a constant but large size, with the intention of using max int or so:
var input64 = cast[array[0..99999, uint64]](input)
But it failed at runtime:
Traceback (most recent call last)
siphash.nim(96)          siphash
siphash.nim(41)          hash
SIGSEGV: Illegal storage access. (Attempt to read from nil?)
Hint: operation successful (10141 lines compiled; 0.707 sec total; 17.174MB) [SuccessX]
Line 41 is the cast, not subsequent usage.
Yes, I'm compiling for release; otherwise it takes 190 cycles/byte. I also tried pragmas, but they did nothing.
var s = newSeq[uint64](size)
var s = @[a, b, c, ...] # shorthand syntax
In JavaScript there's an ArrayBufferView that can provide different data views on the same array, i.e. you can read uint8's or uint32's from the same array:
https://developer.mozilla.org/en-US/docs/Web/API/ArrayBufferView
I think something like that would have helped you in this situation, while keeping the code safe. Maybe it would be possible to create an arrayview module for Nimrod?
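A rough sketch of what such a view could look like in Nimrod (untested, and 'viewU64' is just a made-up name; it also assumes the platform tolerates unaligned loads, which is the very thing being discussed):

# hypothetical helper: reinterpret 8 bytes of a byte buffer as one uint64
proc viewU64(buf: var openarray[uint8], offset: int): uint64 =
  cast[ptr uint64](addr buf[offset])[]

A safe version would fall back to byte-by-byte assembly on platforms that don't allow unaligned access.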
I got to study the C code, and I'm shocked, because I've benchmarked nearly every C(++) implementation of SipHash and none managed to beat mine (which is in fact based on the fastest one out there). What can I say: congratulations and thank you.
However, there's another part of my question. This code is going to crash on most machines because of alignment issues. Is there a way to detect the target platform at runtime?
End of mystery. Time measurements with cpuTime() turned out to be too imprecise. After I modified the C code generated by the Nimrod compiler to use the rdtsc instruction, like my C++ benchmark does, I got 1.5089 ticks/byte, exactly the same as the best C(++) implementations.
A great result, I think.
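(Instead of editing the generated C, one could probably also bind the intrinsic directly from Nimrod; a sketch, assuming compilation through gcc or clang on x86:)

# sketch: bind the compiler's __rdtsc intrinsic (x86 only, gcc/clang)
proc rdtsc(): uint64 {.importc: "__rdtsc", header: "<x86intrin.h>".}

let t0 = rdtsc()
# ... work to measure ...
let cycles = rdtsc() - t0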
FYI: Nimrod default-initializes (zeroes) the memory of vars and of the proc's implicit 'result'. The {.noInit.} pragma prevents that (leaving unset vars as garbage, like in C/C++).
I'm not sure it will help since you're not using 'result' (just the return keyword), but one minor optimization you could try is to put a {.noInit.} pragma on the 'hash' proc. I'm unaware of all the semantic analysis Nimrod does here, so again, may not help at all, but it's worth a try.
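For illustration (hash64 below is a made-up stand-in, not your hash proc):

var scratch {.noInit.}: array[0..7, uint8]   # left uninitialized

proc hash64(x: uint64): uint64 {.noInit.} =  # 'result' is not zeroed first
  result = x * 0x9e3779b97f4a7c15'u64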
The 2 mentioned last are the ones that are likely to benefit from pragmas, and if I get that far (I already have the 2nd task in mind: using macros to create a for-else loop), I will surely revisit all the macros. Thanks anyway.
I did some more comprehensive benchmarks: my Nimrod implementation, 15 in C, 2 in C++, 3 in Java, and 3 in Python. This time I measured performance on both short and long messages. The Nimrod-generated C code won on short messages and came second on long ones, 5% behind an implementation that gcc happens to understand very well (and clang much worse). Against that particular implementation, performance on 12-byte strings was 10% better.
I haven't done any optimizations except for the already mentioned forcing of unaligned reads; even {.noInit.} proved unnecessary.
Call me impressed.
Anyway, I'd like to repeat a question that seems to have been missed: Is there a way to do compile-time platform detection? I'd like to do unaligned reads only when I can make sure it's safe.
BTW, Java turned out to be ca. 3 times slower for long messages and 3000 times slower for short (12 B) ones. Python: 2000 times slower for both short and long messages. I keep looking for errors, but so far I don't see any. I have certainly spent more time optimizing Java and (especially) Python than Nimrod so far.
Mścigniew said: Is there a way to do compile-time platform detection?
You mean like
when defined(Linux):
...
elif defined(Windows):
...
? I'm not sure where to look that up, though I'm sure it's possible, and I'm surprised no one better informed than I am has answered your question yet. But if all else fails, you can always pass your own compiler switches with -d:yourdef. Example:
when defined(ArchX86):
...
elif defined(ArchARM):
...
and compile with nimrod c -d:ArchX86 ..., etc. Sorry I can't be of more help. To get a better answer, try hopping on the IRC and asking there.
I don't understand what you need precisely, but if you use 'nimrod dump <filename>'* on a compilable file, you'll get a printout of the currently defined symbols, as well as the search paths for libraries. On my Win64 machine, I have the 'x8664', 'cpu64', 'littleendian', and 'amd64' symbols defined - perhaps those symbols will help?
Note - the 'dump' command is undocumented, and may go away in the future
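For instance, those symbols could gate the unaligned read at compile time; a sketch (untested, with the symbol names taken from the dump output):

# sketch (untested): choose the read strategy from the defined symbols
var buf: array[0..63, uint8]
let i = 0

when defined(amd64) or defined(i386):
  # x86 tolerates unaligned loads, so reinterpret 8 bytes directly
  let word = cast[ptr uint64](addr buf[i])[]
else:
  # portable fallback: assemble the 64-bit word byte by byte (little endian)
  var word = 0'u64
  for j in 0..7:
    word = word or (cast[uint64](buf[i+j]) shl (8*j))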
Thanks for the hint. I get those too. 'amd64' and 'x8664' look to be what I need, though I wondered why there are two that usually mean the same thing, so I grepped the sources.
I found 3 places that define platform-specific stuff, each different. compiler/platforms.nim looks like the right one:
(name: "i386", intSize: 32, endian: littleEndian, floatSize: 64, bit: 32),
(name: "m68k", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "alpha", intSize: 64, endian: littleEndian, floatSize: 64, bit: 64),
(name: "powerpc", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "powerpc64", intSize: 64, endian: bigEndian, floatSize: 64,bit: 64),
(name: "sparc", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "vm", intSize: 32, endian: littleEndian, floatSize: 64, bit: 32),
(name: "ia64", intSize: 64, endian: littleEndian, floatSize: 64, bit: 64),
(name: "amd64", intSize: 64, endian: littleEndian, floatSize: 64, bit: 64),
(name: "mips", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "arm", intSize: 32, endian: littleEndian, floatSize: 64, bit: 32),
(name: "js", intSize: 32, endian: bigEndian,floatSize: 64,bit: 32),
(name: "nimrodvm", intSize: 32, endian: bigEndian, floatSize: 64, bit: 32),
(name: "avr", intSize: 16, endian: littleEndian, floatSize: 32, bit: 16)]
I wonder what vm and nimrodvm are...
ADDED: And what's the difference between 'bit' and 'intSize'? Last time I checked, avr was 8-bit...