nimforum mirror - These Lemire Benchmarks are really nice

Stefan_Salewski (original) [2018-08-23T17:00:35+02:00] view original

https://lemire.me/blog/2018/08/22/avoid-lexicographical-comparisons-when-testing-for-string-equality/

Output of running his code is


$ ./memcmpfun
mass_comparison(bigarray1, bigarray2,comparisonsA, N)       	:  8.296 cycles per operation (best)  2.411 bytes per cycle (best) 	43.689 cycles per operation (avg)
mass_comparison_fast(bigarray1, bigarray2,comparisonsB, N)  	:  3.778 cycles per operation (best)  5.294 bytes per cycle (best) 	4.356 cycles per operation (avg)
mass_comparison_faststruct(bigarray1, bigarray2,comparisonsB, N)	:  4.667 cycles per operation (best)  4.286 bytes per cycle (best) 	6.015 cycles per operation (avg)
mass_comparison_bcmp(bigarray1, bigarray2,comparisonsB, N)  	:  3.333 cycles per operation (best)  6.000 bytes per cycle (best) 	3.793 cycles per operation (avg)
mass_comparison_hash(bigarray1, bigarray2,comparisonsB, N)  	:  8.000 cycles per operation (best)  2.500 bytes per cycle (best) 	10.104 cycles per operation (avg)

I have no idea how he gets the exact cycle count -- but it's all contained in 200 lines of code in the file benchmark.h

Stefan_Salewski (original) [2018-08-24T09:29:56+02:00] view original

I recently saw an advertisement for your lib, but have had no spare time to test it yet.

Is there a chance to get cycle count output for your lib? ns output is fine for comparisons on the same box, but cycles are great for absolute value discussion.

mratsim (original) [2018-08-24T11:24:12+02:00] view original

I also would like to have rdtsc tests to catch perf regression, see: https://github.com/mratsim/Arraymancer/issues/135

@Stefan_Salewski, note that it's easy to wrap RDTSC, see https://gist.github.com/edubart/f6c92b1fdfca1c1e15ec34bb45f88595

import sequtils, random

# Bindings to the C allocator functions declared in <stdlib.h>.
# `c_malloc` maps to `malloc` and takes the requested size.
proc c_malloc(size: csize): pointer {.importc: "malloc", header: "<stdlib.h>".}
# `c_aligned_alloc` maps to `aligned_alloc`; note the argument order is
# (alignment, size).
proc c_aligned_alloc(alignment, size: csize): pointer {.importc: "aligned_alloc", header: "<stdlib.h>".}
# `c_free` maps to `free`; the benchmark uses it to release pointers obtained
# from both `c_malloc` and `c_aligned_alloc`.
proc c_free(p: pointer) {.importc: "free", header: "<stdlib.h>".}
# Bindings to the allocator functions declared in <mkl.h>.
# NOTE(review): `align` is declared as Nim `int`; if the C prototype in mkl.h
# takes a C `int`, `cint` would mirror it more closely -- confirm before
# changing, since callers pass a plain `int` constant.
proc mkl_malloc(size: csize, align: int): pointer {.importc: "mkl_malloc", header: "<mkl.h>".}
# `mkl_free` maps to `mkl_free` and releases a pointer from `mkl_malloc`.
proc mkl_free(p: pointer) {.importc: "mkl_free", header: "<mkl.h>".}

# Extra linker flags: the three MKL libraries named here plus libm.
{.passL:"-lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lm".}

proc rdtsc(): int64 =
  ## Executes the `rdtsc` instruction and returns its reading as one `int64`.
  ##
  ## The instruction delivers its value as two 32-bit halves, which are
  ## captured into `lo` and `hi` and then merged into a single integer.
  var hi, lo: uint32
  # The line after the mnemonic is the output-operand list: the "=a"
  # constraint writes into `lo`, the "=d" constraint writes into `hi`.
  # The backticks let the asm string refer to the Nim locals by name.
  asm """
    rdtsc
    :"=a"(`lo`), "=d"(`hi`)
  """
  # `hi` becomes bits 32..63 of the result, `lo` stays in bits 0..31.
  result = int64(lo) or (int64(hi) shl 32)
type
  CycleCounter = object
    ## Stopwatch that measures in `rdtsc()` ticks instead of wall-clock time.
    startCycles: int64 ## `rdtsc()` reading stored by the last `restart`.

proc restart(self: var CycleCounter) {.inline.} =
  ## Re-arms the counter by storing the current `rdtsc()` reading.
  let now = rdtsc()
  self.startCycles = now

proc elapsedCycles(self: CycleCounter): int64 {.inline.} =
  ## Number of ticks counted since the last call to `restart`.
  let now = rdtsc()
  result = now - self.startCycles

proc iterationCycles(self: CycleCounter, iters: int64): float {.inline.} =
  ## Average ticks per iteration: the elapsed ticks divided by `iters`,
  ## computed in floating point.
  let total = float(self.elapsedCycles())
  result = total / float(iters)

# Benchmark parameters.
const ALIGNMENT = 64          # alignment passed to the aligned allocators;
                              # every request size is a multiple of it
const MAX_SIZE = 128*128*128  # exclusive upper bound for the random draw
const MAX_ITERS = 1000000     # number of allocations timed per allocator

# One request size per iteration, shared by every allocator under test so they
# all see the same sequence. `newSeqWith` re-evaluates the expression for each
# element, so every entry gets its own random draw.
#
# `rand(MAX_SIZE - 1)` replaces the deprecated `random(MAX_SIZE)`: `rand` has
# an inclusive upper bound, so subtracting one keeps the draw in
# 0 ..< MAX_SIZE. The draw is rounded down to a multiple of ALIGNMENT and
# clamped to at least ALIGNMENT so that no request is ever zero-sized.
var alloc_sizes = newSeqWith(MAX_ITERS, max((rand(MAX_SIZE - 1) div ALIGNMENT) * ALIGNMENT, ALIGNMENT))

proc main() =
  ## Times each allocator over every entry of `alloc_sizes` and echoes the
  ## average number of cycles per iteration for each of them.
  ##
  ## Each pass obtains a buffer, adds its first `float32` to an accumulator
  ## and releases the buffer again. The accumulator is echoed at the very end
  ## so the reads stay observable.
  var
    timer: CycleCounter
    acc = 0.0f

  template timed(label: string, work: untyped) =
    # Restart the counter, run one benchmark pass, report cycles/iteration.
    timer.restart()
    work
    echo label, timer.iterationCycles(MAX_ITERS), " cycles"

  timed("C malloc "):
    for n in alloc_sizes:
      let buf = c_malloc(n*sizeof(float32))
      acc += cast[ptr UncheckedArray[float32]](buf)[0]
      c_free(buf)

  timed("C aligned_alloc "):
    for n in alloc_sizes:
      let buf = c_aligned_alloc(ALIGNMENT, n*sizeof(float32))
      acc += cast[ptr UncheckedArray[float32]](buf)[0]
      c_free(buf)

  timed("nim allocShared "):
    for n in alloc_sizes:
      let buf = allocShared(n*sizeof(float32))
      acc += cast[ptr UncheckedArray[float32]](buf)[0]
      deallocShared(buf)

  timed("nim alloc "):
    for n in alloc_sizes:
      let buf = alloc(n*sizeof(float32))
      acc += cast[ptr UncheckedArray[float32]](buf)[0]
      dealloc(buf)

  timed("nim seq "):
    for n in alloc_sizes:
      # Reserve capacity first, then grow to the full length.
      var buf = newSeqOfCap[float32](n)
      buf.setLen(n)
      acc += buf[0]

  timed("mkl_alloc "):
    for n in alloc_sizes:
      let buf = mkl_malloc(n*sizeof(float32), ALIGNMENT)
      acc += cast[ptr UncheckedArray[float32]](buf)[0]
      mkl_free(buf)

  echo "just printed to foll gcc: ", acc

# Run the whole benchmark twice: once cold, once after the first run has
# already exercised every allocator.
for banner in ["--- initial ---", "--- after warmup ---"]:
  echo banner
  main()

Stefan_Salewski (original) [2018-08-24T16:01:20+02:00] view original

Great, will try soon :-)

Mirror of forum.nim-lang.org

4153 :: These Lemire Benchmarks are really nice