Program was compiled with
nim -d:release c main.nim
It turned out that it is 5.4 times faster than program in Python, from which it was converted. I expected that speedup will be closer to 30-50. Can you please look at the program and guess, what can be made faster (Nim features, not algorithm).
import algorithm, future, math, tables
var g_counter = 0
type
Tfloat_seq = seq[float]
Tint_seq = seq[int]
Tpoint = Tfloat_seq
Tvector = Tfloat_seq #
Tvectors = seq[Tvector]
proc range(end0: int): Tint_seq =
result = @[]
for i in 0 .. end0 - 1:
add(result, i)
return result
proc range(start0, end0: int): Tint_seq =
result = @[]
for i in start0 .. end0 - 1:
add(result, i)
return result
proc init_seq[T](seq_len: int, val: T): seq[T] =
result = @[]
for i in 0 .. seq_len - 1:
add(result, val)
return result
proc nums[T](ll: seq[T]): Tint_seq =
result = @[]
for i in countup(low(ll), high(ll)):
add(result, i)
return result
proc last[T](ll: var seq[T]): var T =
return ll[high(ll)]
proc mul_list[T](ll: seq[T], koeff: T): seq[T] =
result = @[]
for elt in ll:
add(result, koeff * elt)
return result
proc count[T](seq: seq[T], val: T): int =
result = 0
for i in countup(low(seq), high(seq)):
if seq[i] == val:
inc(result)
return result
proc scalar_product[T](v0, v1: seq[T]): float =
result = 0.0
for i in low(v0) .. high(v0):
result += v0[i] * v1[i]
return result
proc num_to_list(n, len: int, base: int = 2): Tint_seq =
var
n0: int
subst_list: Tint_seq = @[]
n0 = n
for i in countup(0, base - 1):
add(subst_list, i)
result = @[]
for i in countup(0, len - 1):
add(result, subst_list[n0 mod base])
n0 = n0 div base
reverse(result)
return result
proc num3_to_list(n, len: int): Tint_seq =
return num_to_list(n, len, 3)
proc desc_sort_cmp(l0, l1: Tvector): int =
var l01 = reversed(mul_list(l0, -1.0))
var l11 = reversed(mul_list(l1, -1.0))
if l01 == l11:
result = 0
else:
for i in low(l01)..high(l01):
if l01[i] < l11[i]:
result = -1
break
elif l01[i] > l11[i]:
result = 1
break
return result
proc vectors_k(n, k: int): Tvectors =
result = @[]
var ll: Tint_seq
var vector: Tvector
for num in countup(0, 3^n - 1):
ll = num3_to_list(num, n)
if count(ll, 1) == n - k:
vector = @[]
for elt in ll:
add(vector, float(elt - 1))
if false:
var sum0 = 0.0
var weight = 1.0
for elt in vector:
sum0 += weight * elt
weight *= 2
add(result, vector)
sort(result, cmp = desc_sort_cmp)
return result
proc mcc_point(n: int): Tpoint =
var point: Tpoint = @[]
for i in range(n):
add(point, float(i + 1))
last(point) += 1/n
return point
const
c_empty = 0
c_expand = 1
c_add_minus = 2
c_size = 0
c_minus_count = 1
c_state = 2
type
Tformula_elt = seq[int]
Tformula = seq[Tformula_elt]
Tformulas_seq = seq[Tformula]
proc empty_state(): Tformula_elt =
result = @[0, 0, 0]
result[c_size] = 0
result[c_minus_count] = 0
result[c_state] = c_empty
return result
proc state_one(): Tformula_elt =
result = @[0, 0, 0]
result[c_size] = 1
result[c_minus_count] = 1
result[c_state] = c_expand
return result
proc rewrite_formula(formula: Tformula, n: int): Tformula =
result = @[]
var size_sum = 0
var size, minus_count: int
for f_elt in formula:
var state = f_elt[c_state]
if state == c_empty:
size = n - size_sum
minus_count = 0
else:
size = f_elt[c_size]
minus_count = f_elt[c_minus_count]
size_sum += size
if size != 0 or minus_count != 0:
add(result, @[size, minus_count])
return result
proc symm_region_formulas(n: int): Tformulas_seq =
#options = c_empty, c_expand, c_add_minus
var formula = @[empty_state()]
var iter = -1
result = @[]
var size_sum: int
while len(formula) > 0:
iter += 1
size_sum = 0
for d in formula:
size_sum += d[c_size]
if last(formula)[c_state] == c_empty:
add(result, rewrite_formula(formula, n))
discard pop(formula)
if false:
echo ""
echo formula
echo rewrite_formula(formula, n)
echo result
g_counter += 1
if g_counter == 10:
raise newException(Exception, "temp exception")
if len(formula) == 0: #
add(formula, state_one())
add(formula, empty_state())
else:
if size_sum + last(formula)[c_size] <= n:
var new_elt: Tformula_elt
deepCopy(new_elt, last(formula))
add(formula, new_elt)
add(formula, empty_state())
elif last(formula)[c_state] == c_expand:
if size_sum < n:
var d = addr(last(formula))
d[][c_size] += 1
d[][c_minus_count] = 1
d[][c_state] = c_add_minus
add(formula, empty_state())
else:
discard pop(formula)
elif last(formula)[c_state] == c_add_minus:
var d = addr(last(formula))
if d[][c_minus_count] < d[][c_size]:
d[][c_minus_count] += 1
add(formula, empty_state())
if d[][c_minus_count] == d[][c_size]:
d[][c_state] = c_expand
echo "len(result): ", len(result)
return result
proc formula_v_nums(formula: Tformula, vectors: Tvectors): Tint_seq =
var v_nums: Tint_seq = @[]
for v_num, v in vectors:
var is_found = true
var size_sum = 0
for f_elt in formula:
var size = f_elt[0]
var minus_count = f_elt[1]
var v_elt_minus_count = count(v[size_sum .. size_sum + size - 1], -1)
if v_elt_minus_count == 0 and minus_count == size:
discard
elif v_elt_minus_count != minus_count:
is_found = false
break
size_sum += size
if is_found:
add(v_nums, v_num)
return v_nums
proc perm_point(point: Tpoint, sign_perm, perm: Tint_seq): Tpoint =
var n = len(point)
result = @[]
for i in range(n):
add(result, point[perm[i]] * float(sign_perm[perm[i]]))
return result
proc bin_list(n: int, len0: int = -1): Tint_seq =
var n0 = n
var digits: Tint_seq = @[]
if n0 == 0:
add(digits, 0)
while n0 > 0:
var rem = n0 mod 2
add(digits, rem)
n0 = int(n0 / 2)
reverse(digits)
if len0 == -1:
return digits
else:
var zeros: Tint_seq = @[]
for i in range(len0 - len(digits)):
add(zeros, 0)
return zeros & digits
proc sign_perm_by_num(n, num: int): Tint_seq =
var ll = bin_list(num, n)
result = @[]
for elt in ll:
add(result, -2*elt + 1)
return result
proc perm_by_num(n, num: int): Tint_seq =
var num0 = num
var perm0 = range(n)
result = @[]
for i in range(n):
var pos = num0 div fac(n - i - 1)
add(result, perm0[pos])
delete(perm0, pos)
num0 -= pos * fac(n - i - 1)
return result
proc map_vector_num3(n: int, vectors: Tvectors, vectors_as_dict: Table[Tvector, int], v_num, sign_perm_num, perm_num: int): int =
var sign_perm0 = sign_perm_by_num(n, sign_perm_num)
var perm0 = range(n)
var v0 = perm_point(vectors[v_num], sign_perm0, perm0)
var v_num0 = vectors_as_dict[v0]
var sign_perm1 = init_seq(n, 1)
var perm1 = perm_by_num(n, perm_num)
var v1 = perm_point(vectors[v_num0], sign_perm1, perm1)
var v_num1 = vectors_as_dict[v1]
return v_num1
proc point_satisfies_formula(point: Tpoint, formula: Tformula): bool =
var min_sp = 0.0
var size_sum = 0
for formula_elt in formula:
var size = formula_elt[c_size]
var minus_count = formula_elt[c_minus_count]
var plus_count = size - minus_count
var p_part = point[size_sum .. size_sum + size - 1]
sort(p_part, system.cmp)
var min_sp_part = sum(p_part[0 .. plus_count - 1]) - sum(p_part[plus_count .. high(p_part)])
if minus_count == size:
min_sp_part = -abs(min_sp_part)
min_sp += min_sp_part
size_sum += size
return min_sp > 0
proc dig_vectors_random[T](point: Tpoint, arg_res: seq[T], try_count: int = 10_000): int =
var point0 = point
var n = len(point0)
var vectors = vectors_k(n, n)
var formulas = symm_region_formulas(n)
var f_v_nums_tuples = initTable[Tint_seq, int]()
var f_num = -1
while f_num < len(formulas) - 1:
f_num += 1
var formula = formulas[f_num]
echo formula
if f_num mod 100 == 0:
echo("f_num:", f_num)
var f_v_nums = formula_v_nums(formula, vectors)
echo f_v_nums
if hasKey(f_v_nums_tuples, f_v_nums): raise newException(Exception, "f_v_nums_tuples exception")
f_v_nums_tuples[f_v_nums] = f_num
echo("len(formulas):", len(formulas))
var v_sat_ar: Tint_seq = @[]
var v_sat_ar2 = init_seq(len(formulas), v_sat_ar)
for f_v_nums, f_num in pairs(f_v_nums_tuples):
var v_sat_ar = init_seq(len(vectors), 0)
for v_num in f_v_nums:
v_sat_ar[v_num] = 1
v_sat_ar2[f_num] = v_sat_ar
var rest_v_nums: Tint_seq = @[]
for v_num in nums(vectors):
if scalar_product(vectors[v_num], point0) > 0:
add(rest_v_nums, v_num)
var sign_perm_count = 2^n
var perm_count = fac(n)
var vectors_as_dict = initTable[Tvector, int]()
for v_num in nums(vectors):
vectors_as_dict[vectors[v_num]] = v_num
var iter = -1
while len(rest_v_nums) > 0:
iter += 1
echo()
echo("iter:", iter)
echo("arg_res:", arg_res)
echo("len(rest_v_nums):", len(rest_v_nums))
var min_rest_v_nums_len = 1_000_000_000
var min_rest_v_nums: Tint_seq = nil
var min_perm_point: Tpoint = nil
for try_num in range(try_count):
if try_num mod 10^3 == 0:
echo("try_num:", try_num)
var sign_perm_num = random(sign_perm_count)
var sign_perm = sign_perm_by_num(n, sign_perm_num)
var perm_num = random(perm_count)
var perm = perm_by_num(n, perm_num)
var perm_p = perm_point(point0, sign_perm, perm)
var perm_rest_v_nums: Tint_seq = @[]
for v_num in rest_v_nums:
add(perm_rest_v_nums, map_vector_num3(n, vectors, vectors_as_dict, v_num, sign_perm_num, perm_num))
for f_num, formula in formulas:
if not point_satisfies_formula(perm_p, formula):
continue
var perm_rest_v_nums1: Tint_seq = @[]
for v_num in perm_rest_v_nums:
if v_sat_ar2[f_num][v_num] == 0:
add(perm_rest_v_nums1, v_num)
if len(perm_rest_v_nums1) < min_rest_v_nums_len:
min_rest_v_nums = perm_rest_v_nums1
min_rest_v_nums_len = len(min_rest_v_nums)
min_perm_point = perm_p
rest_v_nums = min_rest_v_nums
point0 = min_perm_point
var iter_count = iter + 1
echo("iter_count:", iter_count)
return iter_count
proc dig_vectors_marathon(): void =
var arg_res = @[initTable[int, int](), initTable[int, int]()]
var start_n = 6
var end_n = 14
for n in range(start_n, end_n):
arg_res[0][n] = 1_000_000_000
arg_res[1][n] = 1_000_000_000
for try_count_num, try_count in [10^4, 10^5]:
for n in range(start_n, end_n):
var p = mcc_point(n)
var depth = dig_vectors_random(p, arg_res, try_count)
arg_res[try_count_num][n] = min(arg_res[try_count_num][n], depth)
randomize()
dig_vectors_marathon()
proc last[T](ll: var seq[T]): var T =
return ll[high(ll)]
is Tformula_elt only have three element? then you can use object, tuple, or array rather than seq[int] for example:
type
Tformula_elt = object
size, minus_count, state: int
template empty_state(): Tformula_elt = Tformula_elt(size:0, minus_count:0, state:0)
or
type
Tformula_elt = tuple[size, minus_count, state: int]
template empty_state(): Tformula_elt = (size:0, minus_count:0, state:0)
or
type
Tformula_elt = array[0..2, int]
template empty_state(): Tformula_elt = [0,0,0]
they will make your code a bit cleaner and faster
also compile with nim --checks:off -d:release c main.nim
Reuse seqs and other dynamic objects instead of allocating new ones all the time. Pre-set the size instead of adding element by element and thereby causing resizes.
Edit: Running it through valgrind --tool=callgrind and visualizing the result in kcachegrind shows that most time is spent in newSeq, newObj, adding and resizing: http://i.imgur.com/lXfaNyy.png
Thank you all.
I don't understand this: "Reuse seqs and other dynamic objects instead of allocating new ones all the time." How to reuse them? make global? Compile with "static"? Or something else?
EDIT: I think I need to pass them as "returned" or "in-out" parameter.
Okay, first of all: avoid creating seqs where you don't need them. An example are the range operations. While they have only minor impact on performance, many of their uses create completely avoidable overhead. Use 0..n-1 or 0..<n instead of range(n), for example. If you want to use range as an abstraction, make it an iterator. E.g.:
type IntIter = iterator(): int {.inline.}
template range(x: int): IntIter = 0..x-1
template range(x, y: int): IntIter = x..y-1
If you do actually need a sequence initialized with values from 0..n-1, simply use the toSeq template from sequtils. For example:
import sequtils
let n = 10
let r = toSeq(0..n-1)
Similarly, nums should be an iterator instead of a procedure that returns a seq. Note that these changes give you only a relatively minor benefit in performance, since in your code they are largely used outside of inner loops (where the biggest gains are).
Second, reusing them means: writing code so that already allocated seqs are being reused. This generally means adapting algorithms and may be non-trivial work. For example, you have the following line of code:
var v_elt_minus_count = count(v[size_sum .. size_sum + size - 1], -1)
The v[size_sum .. size_sum + size - 1] part creates a copy of a part of the seq that is being used, but not changed, and then thrown away immediately. Instead, you can create a variant of count that deals with a subset of indices and use that instead:
proc count[T](seq: seq[T], a, b: int, val: T): int =
result = 0
for i in countup(a, b):
if seq[i] == val:
inc(result)
...
proc formula_v_nums(formula: Tformula, vectors: Tvectors): Tint_seq =
...
var v_elt_minus_count = count(v, size_sum, size_sum + size - 1, -1)
...
When run with a smaller example of your code (end_n = 7), this change reduced the runtime cost by about 15% on my machine simply because fewer wasted allocations were done. Doing something similar for the line:
var min_sp_part = sum(p_part[0 .. plus_count - 1]) - sum(p_part[plus_count .. high(p_part)])
reduced runtime down to about 62% of the original runtime.
Another example is desc_sort_cmp, which does four unnecessary allocations and could be implemented entirely without any (though the performance impact is minor, I note it just as an example).
Third, seq assignment where the target is mutable is deep by default unless the sequence is marked as shallow or shallowCopy is being used. This is done to avoid aliasing problems (and a good default for correctness), but can affect performance. Examples:
Note that often these micro-optimizations may not buy you much; however, using let instead of var where possible is a good idea regardless of performance.
Fourth, a common idiom is to initialize a sequence to be empty and then repeatedly grow that by using add(). This may require several reallocations to initalize a sequence. You can avoid the overhead by using the following idiom instead:
result = newSeq[T](n)
for i in 0..n-1:
result[i] = f(i)
Note that unless this is being used in an inner loop, the above code may not actually buy you much. For example, it's common for the "empty sequence + add" idiom to occur in initialization sections that are only executed once (or a few times at most), creating negligible overhead.
There are other minor things that can improved, but these are some of the bigger ones.
for i in range(100)
than for i in 0..<100
(I'd rather write for i in 100
but well)Arrrrrrrrr: Actually range should be added to system.
I'm fine with that in principle, but in this case, range is a misnomer (plus, the identifier is already used for a type, e.g. range[0..9]). The name was chosen for Python because it actually depicts a sequence of values (even if, as in Python 3 or for xrange in Python 2, this collection is never actually reified). To describe an iterator from 0 to n-1, you want a name that fits better.