When it comes to macro examples, I can only find theory, tutorials, etc.
I'm interested in seeing how this feature can be useful in everyday production code, for things that I couldn't express without macros. What are some such macro usages?
For example, for lambdas/closures, I would sell the feature with the example of passing a comparator to a sort function. What would be the selling point of Nim's macros?
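For reference, the closure "selling point" mentioned above might be sketched like this in Nim, using `algorithm.sort` with an inline comparator (standard library only, nothing macro-specific):

```nim
import algorithm

var scores = @[42, 7, 19]
# Pass an anonymous proc (a closure) as the comparison function
scores.sort(proc (a, b: int): int = cmp(b, a))  # descending order
echo scores  # @[42, 19, 7]
```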
https://github.com/unicredit/csvtools relies on type definitions to automatically generate a CSV parser
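The idea is roughly the following sketch; the field names are illustrative and the iterator name (`csvRows`) is from memory, so it may differ from the actual csvtools API:

```nim
import csvtools

type Payment = object
  time: string
  accountFrom, accountTo: string
  amount: float

# The row parser is derived at compile time from the
# Payment type definition: no hand-written parsing code.
for payment in csvRows[Payment]("payments.csv"):
  echo payment.amount
```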
https://github.com/andreaferretti/memo defines a macro that performs memoization of a function. Memoization is easy to do for non-recursive functions, but doing it for recursive functions requires rewriting all self-calls into calls to the generated memoized version, hence the need for a macro.
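A sketch of how it is used, based on my recollection of the memo README (the pragma name `memoized` is an assumption):

```nim
import memo

# The macro rewrites the recursive self-calls to go through the
# cache, turning the exponential naive fib into a linear one.
proc fib(n: int): int {.memoized.} =
  if n < 2: n
  else: fib(n - 1) + fib(n - 2)

echo fib(40)
```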
This macro makes it possible to perform FFI calls using the Fortran ABI instead of the C one, as used here for example. Admittedly this example does not require a macro, but using a macro can automate an otherwise tedious and repetitive pattern.
My matrix multiplication kernels reach the speed of industry-grade pure-assembly libraries, in pure Nim:
https://github.com/numforge/laser/tree/d1e6ae61/laser/primitives/matrix_multiplication
Adding a new type or CPU architecture is very easy thanks to macros:
ukernel_generator(
      x86_AVX512,
      typ = float32,
      vectype = m512,
      nb_scalars = 16,
      simd_setZero = mm512_setzero_ps,
      simd_broadcast_value = mm512_set1_ps,
      simd_load_aligned = mm512_load_ps,
      simd_load_unaligned = mm512_loadu_ps,
      simd_store_unaligned = mm512_storeu_ps,
      simd_mul = mm512_mul_ps,
      simd_add = mm512_add_ps,
      simd_fma = mm512_fmadd_ps
    )
In comparison, OpenBLAS has to write a ~7000-line assembly file for each architecture (https://github.com/xianyi/OpenBLAS/blob/1a6ea8ee6/kernel/x86_64/sgemm_kernel_16x4_skylakex.S), and the repo hosts a lot of those: https://github.com/xianyi/OpenBLAS/tree/develop/kernel/x86_64.
For a more recent example, XNNPACK (started a year ago) uses C "templates" https://github.com/google/XNNPACK/tree/73ccfb4e/src/f32-gemm that a Python build script later transforms into actual C code: https://github.com/google/XNNPACK/tree/73ccfb4e/src/f32-gemm/gen-inc
Another example: the tensor slicing syntax in Arraymancer is impossible in a statically typed language without macros, and its absence is probably one of the main reasons scientists are not using raw C++ or other new languages for scientific computing:
macro `[]`*[T](t: AnyTensor[T], args: varargs[untyped]): untyped =
  ## Slice a Tensor or a CudaTensor
  ## Input:
  ##   - a Tensor or a CudaTensor
  ##   - and:
  ##     - specific coordinates (``varargs[int]``)
  ##     - or a slice (cf. tutorial)
  ## Returns:
  ##   - a value or a tensor corresponding to the slice
  ## Warning ⚠ CudaTensor temporary default:
  ##   For CudaTensor only, this is a no-copy operation, data is shared with the input.
  ##   This proc does not guarantee that a ``let`` value is immutable.
  ## Usage:
  ##   - Basic indexing - foo[2, 3]
  ##   - Basic indexing - foo[1+1, 2*2*1]
  ##   - Basic slicing - foo[1..2, 3]
  ##   - Basic slicing - foo[1+1..4, 3-2..2]
  ##   - Span slices - foo[_, 3]
  ##   - Span slices - foo[1.._, 3]
  ##   - Span slices - foo[_..3, 3]
  ##   - Span slices - foo[_.._, 3]
  ##   - Stepping - foo[1..3|2, 3]
  ##   - Span stepping - foo[_.._|2, 3]
  ##   - Span stepping - foo[_.._|+2, 3]
  ##   - Span stepping - foo[1.._|1, 2..3]
  ##   - Span stepping - foo[_..<4|2, 3]
  ##   - Slicing until n from the end - foo[0..^4, 3]
  ##   - Span slicing until n from the end - foo[_..^2, 3]
  ##   - Stepped slicing until n from the end - foo[1..^1|2, 3]
  ##   - Slice from the end - foo[^1..0|-1, 3]
  ##   - Slice from the end - expect non-negative step error - foo[^1..0, 3]
  ##   - Slice from the end - foo[^(2*2)..2*2, 3]
  ##   - Slice from the end - foo[^3..^2, 3]
A last example from the scientific domain: this neural network declaration syntax would not be possible without macros (and yes, neural networks can learn FizzBuzz ;)).
const
  NumDigits = 10
  NumHidden = 100

let ctx = newContext Tensor[float32]

network ctx, FizzBuzzNet:
  layers:
    hidden: Linear(NumDigits, NumHidden)
    output: Linear(NumHidden, 4)
  forward x:
    x.hidden.relu.output

let model = ctx.init(FizzBuzzNet)
let optim = model.optimizerSGD(0.05'f32)

# ....

echo answer
# @["1", "2", "fizz", "4", "buzz", "6", "7", "8", "fizz", "10",
# "11", "12", "13", "14", "15", "16", "17", "fizz", "19", "buzz",
# "fizz", "22", "23", "24", "buzz", "26", "fizz", "28", "29", "30",
# "31", "32", "fizz", "34", "buzz", "36", "37", "38", "39", "40",
# "41", "fizz", "43", "44", "fizzbuzz", "46", "47", "fizz", "49", "50",
# "fizz", "52","53", "54", "buzz", "56", "fizz", "58", "59", "fizzbuzz",
# "61", "62", "63", "64", "buzz", "fizz", "67", "68", "fizz", "buzz",
# "71", "fizz", "73", "74", "75", "76", "77","fizz", "79", "buzz",
# "fizz", "82", "83", "fizz", "buzz", "86", "fizz", "88", "89", "90",
# "91", "92", "fizz", "94", "buzz", "fizz", "97", "98", "fizz", "buzz"]
Now, outside of scientific computing, macros are great for generating instruction tables for an assembler or a JIT, for example https://github.com/numforge/laser/blob/d1e6ae61/laser/photon_jit/x86_64/x86_64_ops.nim
op_generator:
  op MOV: # MOV(dst, src) load/copy src into destination
    ## Copy 64-bit register content to another register
    [dst64, src64]: [rex(w=1), 0x89, modrm(Direct, reg = src64, rm = dst64)]
    ## Copy 32-bit register content to another register
    [dst32, src32]: [          0x89, modrm(Direct, reg = src32, rm = dst32)]
    ## Copy 16-bit register content to another register
    [dst16, src16]: [    0x66, 0x89, modrm(Direct, reg = src16, rm = dst16)]
    ## Copy 8-bit register content to another register
    [dst8,  src8]:  [          0x88, modrm(Direct, reg = src8,  rm = dst8)]
    ## Copy 64-bit immediate value into register
    [dst64, imm64]: [rex(w=1), 0xB8 + dst64] & imm64
    ## Copy 32-bit immediate value into register
    [dst64, imm32]: [          0xB8 + dst64] & imm32
    ## Copy 16-bit immediate value into register
    [dst64, imm16]: [    0x66, 0xB8 + dst64] & imm16
    ## Copy 32-bit immediate value into register
    [dst32, imm32]: [          0xB8 + dst32] & imm32
    ## Copy 16-bit immediate value into register
    [dst32, imm16]: [    0x66, 0xB8 + dst32] & imm16
    ## Copy 16-bit immediate value into register
    [dst16, imm16]: [    0x66, 0xB8 + dst16] & imm16
    ## Copy 8-bit immediate value into register
    [dst8,  imm8]:  [          0xB0 + dst8, imm8]
  op LEA:
    ## Load effective address of the target label into a register
    [dst64, label]: [rex(w=1), 0x8D, modrm(Direct, reg = dst64, rm = rbp)]
  op CMP:
    ## Compare 32-bit immediate with 32-bit int at memory location stored in adr register
    [adr, imm64]: [rex(w=1), 0x81, modrm(Indirect, opcode_ext = 7, rm = adr[0])] & imm64
    ## Compare 32-bit immediate with 32-bit int at memory location stored in adr register
    [adr, imm32]: [          0x81, modrm(Indirect, opcode_ext = 7, rm = adr[0])] & imm32
    ## Compare 16-bit immediate with 16-bit int at memory location stored in adr register
    [adr, imm16]: [    0x66, 0x81, modrm(Indirect, opcode_ext = 7, rm = adr[0])] & imm16
    ## Compare 8-bit immediate with byte at memory location stored in adr register
    [adr, imm8]:  [          0x80, modrm(Indirect, opcode_ext = 7, rm = adr[0]), imm8]
  op JZ:
    ## Jump to label if zero flag is set
    [label]: [0x0F, 0x84]
  op JNZ:
    ## Jump to label if zero flag is not set
    [label]: [0x0F, 0x85]
The two major JIT assemblers today are AsmJit, which uses JavaScript as a prepass to generate C++ (https://github.com/asmjit/asmjit/blob/ac77dfcd/tools/tablegen.js), and Xbyak, which uses C++ to generate C++ (https://github.com/herumi/xbyak/blob/7dac9f61/gen/gen_code.cpp).
And a last example from the low-level world: this is how you can define an instruction table for an emulator (https://github.com/mratsim/glyph/blob/8b278c5e/glyph/snes/opcodes.nim#L16-L85).
genOpcTable:
  op ADC: # Add with Carry
    0x61: cycles 6, {Ecc1_m16bit, EccDirectLowNonZero}                   , DirectXIndirect
    0x63: cycles 4, {Ecc1_m16bit}                                        , StackRelative
    0x65: cycles 3, {Ecc1_m16bit, EccDirectLowNonZero}                   , Direct
    0x67: cycles 6, {Ecc1_m16bit, EccDirectLowNonZero}                   , DirectIndirectLong
    0x69: cycles 2, {Ecc1_m16bit}                                        , Immediate
    0x6D: cycles 4, {Ecc1_m16bit}                                        , Absolute
    0x6F: cycles 5, {Ecc1_m16bit}                                        , AbsoluteLong
    0x71: cycles 5, {Ecc1_m16bit, EccDirectLowNonZero, EccCrossBoundary} , DirectIndirectY
    0x72: cycles 5, {Ecc1_m16bit, EccDirectLowNonZero}                   , DirectIndirect
    0x73: cycles 7, {Ecc1_m16bit}                                        , StackRelativeIndirectY
    0x75: cycles 4, {Ecc1_m16bit, EccDirectLowNonZero}                   , DirectX
    0x77: cycles 6, {Ecc1_m16bit, EccDirectLowNonZero}                   , DirectIndirectLongY
    0x79: cycles 4, {Ecc1_m16bit, EccCrossBoundary}                      , AbsoluteY
    0x7D: cycles 4, {Ecc1_m16bit, EccCrossBoundary}                      , AbsoluteX
    0x7F: cycles 5, {Ecc1_m16bit}                                        , AbsoluteLongX

    implementation:
      # #################################################################
      template adcImpl(sys: Sys, T: typedesc[uint8 or uint16], carry, overflow: var bool) =
        # Implements both the uint8 and uint16 modes
        template A {.dirty.} =
          # Alias for the accumulator, depending on the mode
          when T is uint16: sys.cpu.regs.A
          else: sys.cpu.regs.A.lo

        func add(x, y: T, carry, overflow: var bool): T {.inline.} =
          # Addition helper
          # Carry edge cases on uint8:
          # - x = 0,   y = 0,   P.carry = 0 --> result = 0,   carry = 0
          # - x = 255, y = 0,   P.carry = 1 --> result = 0,   carry = 1
          # - x = 0,   y = 255, P.carry = 1 --> result = 0,   carry = 1
          # - x = 127, y = 128, P.carry = 1 --> result = 0,   carry = 1
          # - x = 128, y = 127, P.carry = 1 --> result = 0,   carry = 1
          # - x = 255, y = 0,   P.carry = 0 --> result = 255, carry = 0
          # - x = 0,   y = 255, P.carry = 0 --> result = 255, carry = 0
          # - x = 127, y = 128, P.carry = 0 --> result = 255, carry = 0
          # - x = 128, y = 127, P.carry = 0 --> result = 255, carry = 0
          result = x + y
          carry = carry or result < x
          overflow = overflow or
            not(result.isMsbSet xor x.isMsbSet) or
            not(result.isMsbSet xor y.isMsbSet)

        # Fetch data.
        # `addressingMode` and `extraCycleCosts` are injected by "implementation"
        let val = sys.`addressingMode`(T, `extraCycleCosts`{.inject.})

        # Computation
        # TODO: Decimal mode
        A = add(A, val, carry, overflow)
        A = add(A, T(P.carry), carry, overflow)
      # #################################################################

      var carry, overflow = false
      if P.emulation_mode:
        sys.adcImpl(uint8, carry, overflow)
      else:
        sys.adcImpl(uint16, carry, overflow)

      # Set the flags
      P.carry = carry
      P.overflow = overflow
      P.negative = A.isMsbSet
      P.zero = A == 0

  op AND: # AND Accumulator with memory
    ...
Alternative emulators either rely on Python or Ruby codegen as an intermediate step, use hex opcodes directly, or are very verbose.
And lastly, I doubt any static language besides Nim could express something like npeg (i.e. a custom parser defined inline, without a separate parser-generation and compilation step before use) or a compiler written in macros to optimize linear algebra (https://github.com/numforge/laser/tree/d1e6ae61/laser/lux_compiler/core).
import npeg, strutils, tables

var words = initTable[string, int]()

let parser = peg "pairs":
  pairs <- pair * *(',' * pair) * !1
  word <- +Alpha
  number <- +Digit
  pair <- >word * '=' * >number:
    words[$1] = parseInt($2)

doAssert parser.match("one=1,two=2,three=3,four=4").ok
echo words # {"two": 2, "three": 3, "one": 1, "four": 4}
https://github.com/numforge/laser/tree/d1e6ae61/laser/lux_compiler/core
proc transpose(A: Function): Function =
  # Generator for the transposition function
  var i, j: Domain
  var B: Function

  # Definition of the result function
  B[j, i] = A[i, j]
  return B # Transposition function
is compiled transparently to
proc transpose(A: Tensor[float32]): Tensor[float32] =
  # Transposition kernel
  let dim0 = A.shape[0]
  let dim1 = A.shape[1]
  result = newTensor[float32](dim1, dim0) # transposed shape
  for j in 0 ..< dim1:
    for i in 0 ..< dim0:
      result[j, i] = A[i, j]
with the goal of seamlessly adding parallelism and SIMD vectorization support while letting the user stay with a representation close to the science.

In addition to Andrea's very good examples, there is also https://github.com/c-blake/cligen, which can generate a command parser-dispatcher from proc signatures, or from-command-line initializers from types. Everyone knows about CLI parsers; I think it's a real time saver. The macro code itself is pretty hairy, though.
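A minimal sketch of the cligen idea (`dispatch` is cligen's entry point; the `greet` proc and its parameters are made up for illustration):

```nim
import strutils
import cligen

proc greet(name = "world", shout = false, times = 1) =
  ## Greet somebody from the command line.
  for _ in 1 .. times:
    let msg = "Hello, " & name & "!"
    echo (if shout: msg.toUpperAscii else: msg)

when isMainModule:
  # The macro inspects greet's signature: each parameter with a
  # default becomes an option (--name, --shout, --times), and the
  ## doc comment becomes the help text.
  dispatch(greet)
```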
Much macro code in Nim is pretty involved because it has to handle many cases. If you are just looking for a fully worked-out introduction, I think the memo code is the best place to start.
Also see
https://www.reddit.com/r/nim/comments/d90z2w/practical_examples_showing_how_macros_lead_to/
and maybe ignore the spam links which will be inserted soon in the initial post.
euwren uses macros to create a DSL that allows its users to easily add scripting support to their programs.
import euwren

type
  Greeter = object
    target*: string

proc newGreeter(target: string): Greeter = Greeter(target: target)
proc greet(greeter: Greeter) = echo "Hello, " & greeter.target & "!"

var wren = newWren()
wren.foreign("greeter"):
  Greeter:
    [new] newGreeter
    greet
wren.ready()
Then, scripts like this can be executed using wren.run(sourceCode):
import "greeter" for Greeter
var greeter = Greeter.new("world")
greeter.greet() // -> Hello, world!
greeter.target = "Nim"
greeter.greet() // -> Hello, Nim!
Because Wren is a C library, you'd normally have to deal with a low-level, verbose API full of pointer passing and other unsafe things. euwren abstracts that away and provides a much more user-friendly API that doesn't require non-idiomatic code.
euwren's foreign() is a prime example of how macros can lead to cleaner and more maintainable code: reading a list of names is much easier than reading through a bunch of addProc and addClass calls with low-level procs mixed in between. It makes every proc you bind look the same, no matter what it is, which embraces the DRY principle.