nimforum mirror - Need advice in program optimization

DSblizzard (orginal) [2015-08-14T14:52:39+02:00] view original

Program was compiled with

nim -d:release c main.nim

It turned out that it is 5.4 times faster than program in Python, from which it was converted. I expected that speedup will be closer to 30-50. Can you please look at the program and guess, what can be made faster (Nim features, not algorithm).

import algorithm, future, math, tables


var g_counter = 0

type
  Tfloat_seq = seq[float]
  Tint_seq = seq[int]
  Tpoint = Tfloat_seq
  Tvector = Tfloat_seq #
  Tvectors = seq[Tvector]

proc range(end0: int): Tint_seq =
  result = @[]
  for i in 0 .. end0 - 1:
    add(result, i)
  return result

proc range(start0, end0: int): Tint_seq =
  result = @[]
  for i in start0 .. end0 - 1:
    add(result, i)
  return result

proc init_seq[T](seq_len: int, val: T): seq[T] =
  result = @[]
  for i in 0 .. seq_len - 1:
    add(result, val)
  return result

proc nums[T](ll: seq[T]): Tint_seq =
  result = @[]
  for i in countup(low(ll), high(ll)):
    add(result, i)
  return result

proc last[T](ll: var seq[T]): var T =
  return ll[high(ll)]

proc mul_list[T](ll: seq[T], koeff: T): seq[T] =
  result = @[]
  for elt in ll:
    add(result, koeff * elt)
  return result

proc count[T](seq: seq[T], val: T): int =
  result = 0
  for i in countup(low(seq), high(seq)):
    if seq[i] == val:
      inc(result)
  return result

proc scalar_product[T](v0, v1: seq[T]): float =
  result = 0.0
  for i in low(v0) .. high(v0):
    result += v0[i] * v1[i]
  return result

proc num_to_list(n, len: int, base: int = 2): Tint_seq =
  var
    n0: int
    subst_list: Tint_seq = @[]
  n0 = n
  for i in countup(0, base - 1):
    add(subst_list, i)
  result = @[]
  for i in countup(0, len - 1):
    add(result, subst_list[n0 mod base])
    n0 = n0 div base
  reverse(result)
  return result

proc num3_to_list(n, len: int): Tint_seq =
  return num_to_list(n, len, 3)

proc desc_sort_cmp(l0, l1: Tvector): int =
  var l01 = reversed(mul_list(l0, -1.0))
  var l11 = reversed(mul_list(l1, -1.0))
  if l01 == l11:
    result = 0
  else:
    for i in low(l01)..high(l01):
      if l01[i] < l11[i]:
        result = -1
        break
      elif l01[i] > l11[i]:
        result = 1
        break
  return result

proc vectors_k(n, k: int): Tvectors =
  result = @[]
  var ll: Tint_seq
  var vector: Tvector
  for num in countup(0, 3^n - 1):
    ll = num3_to_list(num, n)
    if count(ll, 1) == n - k:
      vector = @[]
      for elt in ll:
        add(vector, float(elt - 1))
      if false:
        var sum0 = 0.0
        var weight = 1.0
        for elt in vector:
          sum0 += weight * elt
          weight *= 2
      add(result, vector)
  sort(result, cmp = desc_sort_cmp)
  return result

proc mcc_point(n: int): Tpoint =
  var point: Tpoint = @[]
  for i in range(n):
    add(point, float(i + 1))
  last(point) += 1/n
  return point

const
  c_empty = 0
  c_expand = 1
  c_add_minus = 2
  c_size = 0
  c_minus_count = 1
  c_state = 2
type
  Tformula_elt = seq[int]
  Tformula = seq[Tformula_elt]
  Tformulas_seq = seq[Tformula]

proc empty_state(): Tformula_elt =
  result = @[0, 0, 0]
  result[c_size] = 0
  result[c_minus_count] = 0
  result[c_state] = c_empty
  return result

proc state_one(): Tformula_elt =
  result = @[0, 0, 0]
  result[c_size] = 1
  result[c_minus_count] = 1
  result[c_state] = c_expand
  return result

proc rewrite_formula(formula: Tformula, n: int): Tformula =
  result = @[]
  var size_sum = 0
  var size, minus_count: int
  for f_elt in formula:
    var state = f_elt[c_state]
    if state == c_empty:
      size = n - size_sum
      minus_count = 0
    else:
      size = f_elt[c_size]
      minus_count = f_elt[c_minus_count]
      size_sum += size
    if size != 0 or minus_count != 0:
      add(result, @[size, minus_count])
  return result

proc symm_region_formulas(n: int): Tformulas_seq =
  #options = c_empty, c_expand, c_add_minus
  var formula = @[empty_state()]
  var iter = -1
  result = @[]
  var size_sum: int
  while len(formula) > 0:
    iter += 1
    size_sum = 0
    for d in formula:
      size_sum += d[c_size]
    if last(formula)[c_state] == c_empty:
      add(result, rewrite_formula(formula, n))
      discard pop(formula)
      if false:
        echo ""
        echo formula
        echo rewrite_formula(formula, n)
        echo result
        g_counter += 1
        if g_counter == 10:
          raise newException(Exception, "temp exception")
      if len(formula) == 0: #
        add(formula, state_one())
        add(formula, empty_state())
      else:
        if size_sum + last(formula)[c_size] <= n:
          var new_elt: Tformula_elt
          deepCopy(new_elt, last(formula))
          add(formula, new_elt)
          add(formula, empty_state())
    elif last(formula)[c_state] == c_expand:
      if size_sum < n:
        var d = addr(last(formula))
        d[][c_size] += 1
        d[][c_minus_count] = 1
        d[][c_state] = c_add_minus
        add(formula, empty_state())
      else:
        discard pop(formula)
    elif last(formula)[c_state] == c_add_minus:
      var d = addr(last(formula))
      if d[][c_minus_count] < d[][c_size]:
        d[][c_minus_count] += 1
        add(formula, empty_state())
      if d[][c_minus_count] == d[][c_size]:
        d[][c_state] = c_expand
  echo "len(result): ", len(result)
  return result

proc formula_v_nums(formula: Tformula, vectors: Tvectors): Tint_seq =
  var v_nums: Tint_seq = @[]
  for v_num, v in vectors:
    var is_found = true
    var size_sum = 0
    for f_elt in formula:
      var size = f_elt[0]
      var minus_count = f_elt[1]
      var v_elt_minus_count = count(v[size_sum .. size_sum + size - 1], -1)
      if v_elt_minus_count == 0 and minus_count == size:
        discard
      elif v_elt_minus_count != minus_count:
        is_found = false
        break
      size_sum += size
    if is_found:
      add(v_nums, v_num)
  return v_nums

proc perm_point(point: Tpoint, sign_perm, perm: Tint_seq): Tpoint =
  var n = len(point)
  result = @[]
  for i in range(n):
    add(result, point[perm[i]] * float(sign_perm[perm[i]]))
  return result

proc bin_list(n: int, len0: int = -1): Tint_seq =
  var n0 = n
  var digits: Tint_seq = @[]
  if n0 == 0:
    add(digits, 0)
  while n0 > 0:
    var rem = n0 mod 2
    add(digits, rem)
    n0 = int(n0 / 2)
  reverse(digits)
  if len0 == -1:
    return digits
  else:
    var zeros: Tint_seq = @[]
    for i in range(len0 - len(digits)):
      add(zeros, 0)
    return zeros & digits

proc sign_perm_by_num(n, num: int): Tint_seq =
  var ll = bin_list(num, n)
  result = @[]
  for elt in ll:
    add(result, -2*elt + 1)
  return result

proc perm_by_num(n, num: int): Tint_seq =
  var num0 = num
  var perm0 = range(n)
  result = @[]
  for i in range(n):
    var pos = num0 div fac(n - i - 1)
    add(result, perm0[pos])
    delete(perm0, pos)
    num0 -= pos * fac(n - i - 1)
  return result

proc map_vector_num3(n: int, vectors: Tvectors, vectors_as_dict: Table[Tvector, int], v_num, sign_perm_num, perm_num: int): int =
  var sign_perm0 = sign_perm_by_num(n, sign_perm_num)
  var perm0 = range(n)
  var v0 = perm_point(vectors[v_num], sign_perm0, perm0)
  var v_num0 = vectors_as_dict[v0]
  var sign_perm1 = init_seq(n, 1)
  var perm1 = perm_by_num(n, perm_num)
  var v1 = perm_point(vectors[v_num0], sign_perm1, perm1)
  var v_num1 = vectors_as_dict[v1]
  return v_num1

proc point_satisfies_formula(point: Tpoint, formula: Tformula): bool =
  var min_sp = 0.0
  var size_sum = 0
  for formula_elt in formula:
    var size = formula_elt[c_size]
    var minus_count = formula_elt[c_minus_count]
    var plus_count = size - minus_count
    var p_part = point[size_sum .. size_sum + size - 1]
    sort(p_part, system.cmp)
    var min_sp_part = sum(p_part[0 .. plus_count - 1]) - sum(p_part[plus_count .. high(p_part)])
    if minus_count == size:
      min_sp_part = -abs(min_sp_part)
    min_sp += min_sp_part
    size_sum += size
  return min_sp > 0

proc dig_vectors_random[T](point: Tpoint, arg_res: seq[T], try_count: int = 10_000): int =
  var point0 = point
  var n = len(point0)
  var vectors = vectors_k(n, n)
  var formulas = symm_region_formulas(n)
  var f_v_nums_tuples = initTable[Tint_seq, int]()
  var f_num = -1
  while f_num < len(formulas) - 1:
    f_num += 1
    var formula = formulas[f_num]
    echo formula
    if f_num mod 100 == 0:
      echo("f_num:", f_num)
    var f_v_nums = formula_v_nums(formula, vectors)
    echo f_v_nums
    if hasKey(f_v_nums_tuples, f_v_nums): raise newException(Exception, "f_v_nums_tuples exception")
    f_v_nums_tuples[f_v_nums] = f_num
  echo("len(formulas):", len(formulas))
  var v_sat_ar: Tint_seq = @[]
  var v_sat_ar2 = init_seq(len(formulas), v_sat_ar)
  for f_v_nums, f_num in pairs(f_v_nums_tuples):
    var v_sat_ar = init_seq(len(vectors), 0)
    for v_num in f_v_nums:
      v_sat_ar[v_num] = 1
    v_sat_ar2[f_num] = v_sat_ar
  var rest_v_nums: Tint_seq = @[]
  for v_num in nums(vectors):
    if scalar_product(vectors[v_num], point0) > 0:
      add(rest_v_nums, v_num)
  var sign_perm_count = 2^n
  var perm_count = fac(n)
  var vectors_as_dict = initTable[Tvector, int]()
  for v_num in nums(vectors):
    vectors_as_dict[vectors[v_num]] = v_num
  var iter = -1
  while len(rest_v_nums) > 0:
    iter += 1
    echo()
    echo("iter:", iter)
    echo("arg_res:", arg_res)
    echo("len(rest_v_nums):", len(rest_v_nums))
    var min_rest_v_nums_len = 1_000_000_000
    var min_rest_v_nums: Tint_seq = nil
    var min_perm_point: Tpoint = nil
    for try_num in range(try_count):
      if try_num mod 10^3 == 0:
        echo("try_num:", try_num)
      var sign_perm_num = random(sign_perm_count)
      var sign_perm = sign_perm_by_num(n, sign_perm_num)
      var perm_num = random(perm_count)
      var perm = perm_by_num(n, perm_num)
      var perm_p = perm_point(point0, sign_perm, perm)
      var perm_rest_v_nums: Tint_seq = @[]
      for v_num in rest_v_nums:
        add(perm_rest_v_nums, map_vector_num3(n, vectors, vectors_as_dict, v_num, sign_perm_num, perm_num))
      for f_num, formula in formulas:
        if not point_satisfies_formula(perm_p, formula):
          continue
        var perm_rest_v_nums1: Tint_seq = @[]
        for v_num in perm_rest_v_nums:
          if v_sat_ar2[f_num][v_num] == 0:
            add(perm_rest_v_nums1, v_num)
        if len(perm_rest_v_nums1) < min_rest_v_nums_len:
          min_rest_v_nums = perm_rest_v_nums1
          min_rest_v_nums_len = len(min_rest_v_nums)
          min_perm_point = perm_p
    rest_v_nums = min_rest_v_nums
    point0 = min_perm_point
  var iter_count = iter + 1
  echo("iter_count:", iter_count)
  return iter_count

proc dig_vectors_marathon(): void =
  var arg_res = @[initTable[int, int](), initTable[int, int]()]
  var start_n = 6
  var end_n = 14
  for n in range(start_n, end_n):
    arg_res[0][n] = 1_000_000_000
    arg_res[1][n] = 1_000_000_000
  for try_count_num, try_count in [10^4, 10^5]:
    for n in range(start_n, end_n):
      var p = mcc_point(n)
      var depth = dig_vectors_random(p, arg_res, try_count)
      arg_res[try_count_num][n] = min(arg_res[try_count_num][n], depth)


randomize()
dig_vectors_marathon()

andrea (orginal) [2015-08-14T15:00:42+02:00] view original

It is pretty long, so I didn't read the whole thing, but some random comments:

there is no need to materialize all these ranges. You can iterate over a range with something like for i in 3 .. 35, so for i in range(x) should become for i in 0 .. < x

similar for proc nums etc.

this does not need mutability and could be a template

proc last[T](ll: var seq[T]): var T =
  return ll[high(ll)]

return result is redundant and may even create an additional copy (not sure)

you often create empty lists just to append on them. Sequences can be initialized with newSeq(len) so that they are already allocated of a big enough size, if you know it beforehand

jangko (orginal) [2015-08-14T16:00:35+02:00] view original

is Tformula_elt only have three element? then you can use object, tuple, or array rather than seq[int] for example:


type
  Tformula_elt = object
    size, minus_count, state: int

template empty_state(): Tformula_elt = Tformula_elt(size:0, minus_count:0, state:0)


type
  Tformula_elt = tuple[size, minus_count, state: int]

template empty_state(): Tformula_elt = (size:0, minus_count:0, state:0)


type
  Tformula_elt = array[0..2, int]

template empty_state(): Tformula_elt = [0,0,0]

they will make your code a bit cleaner and faster

also compile with nim --checks:off -d:release c main.nim

def (orginal) [2015-08-14T16:05:22+02:00] view original

Reuse seqs and other dynamic objects instead of allocating new ones all the time. Pre-set the size instead of adding element by element and thereby causing resizes.

Edit: Running it through valgrind --tool=callgrind and visualizing the result in kcachegrind shows that most time is spent in newSeq, newObj, adding and resizing: http://i.imgur.com/lXfaNyy.png

andrea (orginal) [2015-08-14T16:17:22+02:00] view original

Also, as a matter of style: it would be much easier to read if you defaulted to using let and only used var when something actually changes.

Jehan (orginal) [2015-08-14T19:23:51+02:00] view original

In general, if your program is dominated by operations that allocate memory, there's only so much speedup that you can gain. This is because even in an interpreter, the memory allocation procedures (that take up the bulk of CPU time in such a program) are written in C.

DSblizzard (orginal) [2015-08-15T18:21:53+02:00] view original

Thank you all.

I don't understand this: "Reuse seqs and other dynamic objects instead of allocating new ones all the time." How to reuse them? make global? Compile with "static"? Or something else?

EDIT: I think I need to pass them as "returned" or "in-out" parameter.

Jehan (orginal) [2015-08-15T22:55:07+02:00] view original

Okay, first of all: avoid creating seqs where you don't need them. An example are the range operations. While they have only minor impact on performance, many of their uses create completely avoidable overhead. Use 0..n-1 or 0..<n instead of range(n), for example. If you want to use range as an abstraction, make it an iterator. E.g.:

type IntIter = iterator(): int {.inline.}

template range(x: int): IntIter = 0..x-1
template range(x, y: int): IntIter = x..y-1

If you do actually need a sequence initialized with values from 0..n-1, simply use the toSeq template from sequtils. For example:

import sequtils
let n = 10
let r = toSeq(0..n-1)

Similarly, nums should be an iterator instead of a procedure that returns a seq. Note that these changes give you only a relatively minor benefit in performance, since in your code they are largely used outside of inner loops (where the biggest gains are).

Second, reusing them means: writing code so that already allocated seqs are being reused. This generally means adapting algorithms and may be non-trivial work. For example, you have the following line of code:

var v_elt_minus_count = count(v[size_sum .. size_sum + size - 1], -1)

The v[size_sum .. size_sum + size - 1] part creates a copy of a part of the seq that is being used, but not changed, and then thrown away immediately. Instead, you can create a variant of count that deals with a subset of indices and use that instead:

proc count[T](seq: seq[T], a, b: int, val: T): int =
  result = 0
  for i in countup(a, b):
    if seq[i] == val:
      inc(result)

...

proc formula_v_nums(formula: Tformula, vectors: Tvectors): Tint_seq =
      ...
      var v_elt_minus_count = count(v, size_sum, size_sum + size - 1, -1)
      ...

When run with a smaller example of your code (end_n = 7), this change reduced the runtime cost by about 15% on my machine simply because fewer wasted allocations were done. Doing something similar for the line:

var min_sp_part = sum(p_part[0 .. plus_count - 1]) - sum(p_part[plus_count .. high(p_part)])

reduced runtime down to about 62% of the original runtime.

Another example is desc_sort_cmp, which does four unnecessary allocations and could be implemented entirely without any (though the performance impact is minor, I note it just as an example).

Third, seq assignment where the target is mutable is deep by default unless the sequence is marked as shallow or shallowCopy is being used. This is done to avoid aliasing problems (and a good default for correctness), but can affect performance. Examples:

Use let instead of var where you do not further need to modify the sequence.

Using add(seqOfSeqs, otherSeq) will make a copy of otherSeq. You can avoid this by calling shallow(otherSeq) beforehand if you're positive that you won't need the default deep copying semantics.

The same goes for seqOfSeqs[i] = otherSeq. You can also use shallowCopy seqOfSeqs[i], otherSeq to improve performance.

Note that often these micro-optimizations may not buy you much; however, using let instead of var where possible is a good idea regardless of performance.

Fourth, a common idiom is to initialize a sequence to be empty and then repeatedly grow that by using add(). This may require several reallocations to initalize a sequence. You can avoid the overhead by using the following idiom instead:

result = newSeq[T](n)
for i in 0..n-1:
  result[i] = f(i)

Note that unless this is being used in an inner loop, the above code may not actually buy you much. For example, it's common for the "empty sequence + add" idiom to occur in initialization sections that are only executed once (or a few times at most), creating negligible overhead.

There are other minor things that can improved, but these are some of the bigger ones.

Arrrrrrrrr (orginal) [2015-08-15T23:21:42+02:00] view original

Actually range should be added to system. It is more readable

for i in range(100)

than

for i in 0..<100

(I'd rather write

for i in 100

but well)

Jehan (orginal) [2015-08-16T00:30:32+02:00] view original

Arrrrrrrrr: Actually range should be added to system.

I'm fine with that in principle, but in this case, range is a misnomer (plus, the identifier is already used for a type, e.g. range[0..9]). The name was chosen for Python because it actually depicts a sequence of values (even if, as in Python 3 or for xrange in Python 2, this collection is never actually reified). To describe an iterator from 0 to n-1, you want a name that fits better.

Mirror of forum.nim-lang.org

1539 :: Need advice in program optimization