Output of running his code is
$ ./memcmpfun
mass_comparison(bigarray1, bigarray2,comparisonsA, N) : 8.296 cycles per operation (best) 2.411 bytes per cycle (best) 43.689 cycles per operation (avg)
mass_comparison_fast(bigarray1, bigarray2,comparisonsB, N) : 3.778 cycles per operation (best) 5.294 bytes per cycle (best) 4.356 cycles per operation (avg)
mass_comparison_faststruct(bigarray1, bigarray2,comparisonsB, N) : 4.667 cycles per operation (best) 4.286 bytes per cycle (best) 6.015 cycles per operation (avg)
mass_comparison_bcmp(bigarray1, bigarray2,comparisonsB, N) : 3.333 cycles per operation (best) 6.000 bytes per cycle (best) 3.793 cycles per operation (avg)
mass_comparison_hash(bigarray1, bigarray2,comparisonsB, N) : 8.000 cycles per operation (best) 2.500 bytes per cycle (best) 10.104 cycles per operation (avg)
I have no idea how he gets the exact cycle count -- but it's all contained in 200 lines of code in the file benchmark.h
I recently saw an advertisement for your lib, but have had no spare time to test it yet.
Is there a chance to get cycle count output for your lib? ns output is fine for comparisons on the same box, but cycles are great for absolute value discussion.
I also would like to have rdtsc tests to catch perf regression, see: https://github.com/mratsim/Arraymancer/issues/135
@Stefan_Salewski, note that it's easy to wrap RDTSC, see https://gist.github.com/edubart/f6c92b1fdfca1c1e15ec34bb45f88595
import sequtils, random
# Thin bindings to the C allocator declared in <stdlib.h>.

# C `malloc`: request `size` bytes.
proc c_malloc(size: csize): pointer {.
  importc: "malloc", header: "<stdlib.h>".}

# C `aligned_alloc`: request `size` bytes with the given `alignment`.
proc c_aligned_alloc(alignment, size: csize): pointer {.
  importc: "aligned_alloc", header: "<stdlib.h>".}

# C `free`: release a pointer obtained from the allocators above.
proc c_free(p: pointer) {.
  importc: "free", header: "<stdlib.h>".}
# Bindings to the allocator exposed by <mkl.h>.
# `mkl_malloc` is called with a byte count and an alignment; `mkl_free`
# receives the pointer to release.
# NOTE(review): `align` is declared as Nim `int`; the C prototype presumably
# takes a C `int` -- confirm whether `cint` would be the more precise type.
proc mkl_malloc(size: csize, align: int): pointer {.importc: "mkl_malloc", header: "<mkl.h>".}
proc mkl_free(p: pointer) {.importc: "mkl_free", header: "<mkl.h>".}
# Linker flags handed to the C toolchain so the MKL symbols above resolve.
{.passL:"-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm".}
proc rdtsc(): int64 =
  ## Reads the x86 time-stamp counter and returns it as a single 64-bit value.
  ##
  ## The `rdtsc` instruction delivers the counter in two 32-bit halves
  ## (constraint "=a" receives the low half, "=d" the high half); they are
  ## captured separately and combined below.
  ##
  ## The instruction is emitted as `__asm__ __volatile__`. An extended-asm
  ## statement that has outputs but is not volatile may be treated by GCC as
  ## a pure computation, so the two reads taken around a timed region could
  ## be merged or moved; `volatile` forces every call to execute in place.
  ##
  ## Only valid on x86/x86-64 with a GCC-compatible C compiler.
  var hi, lo: uint32
  {.emit: """__asm__ __volatile__ ("rdtsc" : "=a"(`lo`), "=d"(`hi`));""".}
  result = int64(lo) or (int64(hi) shl 32)
type CycleCounter = object
  ## Stopwatch that counts in time-stamp-counter ticks.
  start_cycles: int64  ## counter reading stored by the latest `restart`
proc restart(self: var CycleCounter) {.inline.} =
  ## Begins a new measurement by storing the current `rdtsc()` reading.
  let now = rdtsc()
  self.start_cycles = now
proc elapsedCycles(self: CycleCounter): int64 {.inline.} =
  ## Returns a fresh `rdtsc()` reading minus the stored start value.
  let now = rdtsc()
  result = now - self.start_cycles
proc iterationCycles(self: CycleCounter, iters: int64): float {.inline.} =
  ## Returns the elapsed ticks divided by `iters`; both operands are
  ## converted to `float` before the division.
  let total = float(self.elapsedCycles())
  result = total / float(iters)
const
  ALIGNMENT = 64            # alignment requested from the aligned allocators;
                            # also the granularity of the generated sizes
  MAX_SIZE = 128*128*128    # exclusive upper bound of a generated size
  MAX_ITERS = 1000000       # number of sizes generated

# One pseudo-random size per benchmark iteration: a multiple of ALIGNMENT,
# at least ALIGNMENT and below MAX_SIZE. `rand(MAX_SIZE - 1)` draws from
# 0 .. MAX_SIZE-1, the same range the deprecated `random(MAX_SIZE)` covered.
# `randomize()` is not called, so every run sees the same sequence.
var alloc_sizes = newSeqWith(MAX_ITERS,
  max((rand(MAX_SIZE - 1) div ALIGNMENT) * ALIGNMENT, ALIGNMENT))
proc main() =
  ## Runs every allocator benchmark once and prints the average number of
  ## cycles per iteration for each.
  ##
  ## Each benchmark walks `alloc_sizes`, obtains a buffer of `size` float32
  ## elements, reads element 0 into an accumulator and gives the buffer back
  ## (the `seq` variant has no explicit release call).
  var
    timer: CycleCounter
    acc = 0.0f  # printed at the end; presumably keeps the loops from being
                # optimized away (see the final message)

  template timed(label: string, body: untyped) =
    # Shared scaffolding: reset the counter, run `body` once per entry of
    # `alloc_sizes` (exposed to `body` as `size`), then report the average.
    timer.restart()
    for size {.inject.} in alloc_sizes:
      body
    echo label, timer.iterationCycles(MAX_ITERS), " cycles"

  timed "C malloc ":
    let data = c_malloc(size*sizeof(float32))
    acc += cast[ptr UncheckedArray[float32]](data)[0]
    c_free(data)

  timed "C aligned_alloc ":
    let data = c_aligned_alloc(ALIGNMENT, size*sizeof(float32))
    acc += cast[ptr UncheckedArray[float32]](data)[0]
    c_free(data)

  timed "nim allocShared ":
    let data = allocShared(size*sizeof(float32))
    acc += cast[ptr UncheckedArray[float32]](data)[0]
    deallocShared(data)

  timed "nim alloc ":
    let data = alloc(size*sizeof(float32))
    acc += cast[ptr UncheckedArray[float32]](data)[0]
    dealloc(data)

  timed "nim seq ":
    var data = newSeqOfCap[float32](size)
    data.setLen(size)
    acc += data[0]

  timed "mkl_alloc ":
    let data = mkl_malloc(size*sizeof(float32), ALIGNMENT)
    acc += cast[ptr UncheckedArray[float32]](data)[0]
    mkl_free(data)

  echo "just printed to foll gcc: ", acc
# Run the whole suite twice: once cold, once after the first pass has run.
for banner in ["--- initial ---", "--- after warmup ---"]:
  echo banner
  main()