Hello,
How can I optimize the speed of the following proc:
import times, math
proc leibniz(terms: int): float =
  ## Approximates pi with the Leibniz series:
  ## pi = 4 * sum_{k=0}^{terms} (-1)^k / (2k + 1).
  for k in 0 .. terms:
    let kf = float(k)
    # pow(-1, k) supplies the alternating sign (the slow libm-call variant).
    result += pow(-1.0, kf) / (2.0 * kf + 1.0)
  result = 4.0 * result
# Benchmark driver: time a 100-million-term run of the series.
let t0 = cpuTime()
echo(leibniz(100_000_000))
let t1 = cpuTime()
# cpuTime() measures CPU seconds, not wall-clock time.
echo "Elapsed time: ", $(t1 - t0)
I have the following result in my computer: 3.141592663589326 Elapsed time: 8.23
This result is almost 5x faster than my CPython counterpart, but on the other hand it is around 6x slower than Julia, given the following code:
# Approximate pi with the Leibniz alternating series:
# pi = 4 * sum over k of (-1)^k / (2k + 1).
function leibniz(terms)
    acc = 0.0
    for k in 0:terms
        acc += (-1.0)^k / (2.0 * k + 1.0)
    end
    return acc * 4.0
end
println("Pi: ", @time leibniz(100_000_000))
1.374829 seconds (1.72 k allocations: 90.561 KiB) Pi: 3.141592663589326
Yet a bit faster:
for n in 0 .. terms shr 1:
let nn = n shl 1
res += 1.0/(2.0*float(nn)+1.0)
res -= 1.0/(2.0*float(nn)+3.0)
@alfrednewman
I tried to replicate your test. On my computer the following (slightly modified version) of the nim program and the julia program have the same runtime.
Whatever I do, I fail to run the julia version in less than 2 seconds (as you had). Are you sure that test went OK?
import times, math
proc `/`(a, b: int): float =
  ## Integer division that yields a float quotient instead of truncating.
  result = a.float / b.float
proc leibniz(terms: int): float =
  ## Pi via the Leibniz series; relies on an int/int `/` that returns float,
  ## and on math.`^` for the integer power (-1)^k.
  var acc = 0.0
  for k in 0 .. terms:
    acc += (-1)^k / (2*k + 1)
  result = acc * 4.0
# Benchmark driver: time a 100-million-term run.
# cpuTime() measures CPU seconds, not wall-clock time.
let
  t0 = cpuTime()
  pi = leibniz(100_000_000)
  tt = cpuTime() - t0
echo("Elapsed time: ", tt)
echo("Pi: ", pi)
gives
>> nim c -d:release pi.nim
>> time ./pi
Elapsed time: 5.671875
Pi: 3.141592663589326
real 0m5.706s
user 0m5.672s
sys 0m0.000s
and
# Leibniz series for pi: sum the alternating terms, then scale by 4.
function leibniz(terms)
    total = 0.0
    for n in 0:terms
        total += (-1.0)^n / (2.0 * n + 1.0)
    end
    return total * 4.0
end
println("Pi: ", @time leibniz(100_000_000))
gives
>> time julia pi.jl
5.770856 seconds (4.48 k allocations: 226.962 KB)
Pi: 3.141592663589326
real 0m6.538s
user 0m6.594s
sys 0m0.234s
I compiled the same approximate version to C with gcc optimisations on, and found the execution time to be roughly comparable between Nim and C. Could Julia's JIT be doing some sort of optimisation that shortcuts the full code somehow?
With the Nim version:
import times, math
proc leibniz(terms: int): float =
  ## Leibniz series: pi = 4 * sum_{n=0}^{terms} (-1)^n / (2n + 1).
  ## Uses pow(-1, n) for the alternating sign.
  var acc = 0.0
  for n in 0 .. terms:
    acc += pow(-1.0, n.float) / (2.0 * n.float + 1.0)
  result = 4.0 * acc
# Benchmark driver: time a 100-million-term run of the series.
let t0 = cpuTime()
echo(leibniz(100_000_000))
let t1 = cpuTime()
# cpuTime() measures CPU seconds, not wall-clock time.
echo "Elapsed time: ", $(t1 - t0)
I got these output times:
C:projectsNim>nim_version 3.141592663589326 Elapsed time: 6.541
C:projectsNim>nim_version 3.141592663589326 Elapsed time: 6.676
C:projectsNim>nim_version 3.141592663589326 Elapsed time: 6.594
While with the same C version:
#include <stdio.h>
#include <math.h>
#include <time.h>
double leibniz(int terms) {
double res = 0.0;
for (int i = 0; i < terms; ++i) {
res += pow(-1.0, (double)i) / (2.0 * (double)i + 1.0);
}
return 4*res;
}
/* Benchmark driver: time a 100-million-term run and print pi. */
int main() {
    clock_t start = clock();
    double x = leibniz(100000000);
    printf("%.15f\n", x);
    /* clock() returns CPU ticks; divide by CLOCKS_PER_SEC for seconds. */
    printf("Time elapsed: %f\n", ((double)clock() - start) / CLOCKS_PER_SEC);
}
The times taken were (EDIT: used -Ofast instead and got faster times):
C:projectsc>c_version 3.141592643589326 Time elapsed: 6.206000
C:projectsc>c_version 3.141592643589326 Time elapsed: 6.204000
C:projectsc>c_version 3.141592643589326 Time elapsed: 6.217000
I realise I actually got a slightly different decimal number with C (3.141592643589326 instead of 3.141592663589326). This is actually not a formatting issue: the C loop runs with `i < terms` (exclusive), while the Nim and Julia versions iterate `0..terms` inclusive, so the C version sums one fewer term of the series.
Thank you all.
@wiffel/Nibbler, in my Julia test, I was using Pro version 0.6.1 64 bit running on Windows 10.
Now, running the same .jl code on a Windows 8 64 bit (i5 CPU 650 @ 3.20GHz, 3193 Mhz, 2 Cores, 4 Processors) I got the following:
2.763225 seconds (1.77 k allocations: 95.291 KiB)
Pi: 3.141592663589326
After JIT compilation:
1.863196 seconds (1.69 k allocations: 88.809 KiB)
Pi: 3.141592663589326
_
_ _ _(_)_ | A fresh approach to technical computing
(_) | (_) (_) | Documentation: https://docs.julialang.org
_ _ _| |_ __ _ | Type "?help" for help.
| | | | | | |/ _` | |
| | |_| | | | (_| | | Version 0.6.0 (2017-06-19 13:05 UTC)
_/ |\__'_|_|_|\__'_| | Official http://julialang.org/ release
|__/ | x86_64-w64-mingw32
julia> function leibniz(terms)
res = 0.0
for i in 0:terms
res += (-1.0)^i/(2.0*i+1.0)
end
return res * 4.0
end
leibniz (generic function with 1 method)
julia> println("Pi: ", @time leibniz(100_000_000))
1.652999 seconds (1.80 k allocations: 97.407 KiB)
Pi: 3.141592663589326
julia> @code_llvm leibniz(100_000_000)
; Function Attrs: uwtable
define double @julia_leibniz_61566(i64) #0 !dbg !5 {
top:
%1 = icmp sgt i64 %0, -1
%2 = select i1 %1, i64 %0, i64 -1
%3 = add i64 %2, 1
%4 = icmp eq i64 %3, 0
br i1 %4, label %L32, label %if.preheader
if.preheader: ; preds = %top
br label %if
if: ; preds = %if.preheader, %L22
%res.06 = phi double [ %13, %L22 ], [ 0.000000e+00, %if.preheader ]
%"#temp#.05" = phi i64 [ %9, %L22 ], [ 0, %if.preheader ]
%5 = sitofp i64 %"#temp#.05" to double
%6 = call double @llvm.pow.f64(double -1.000000e+00, double %5)
%7 = fadd double %5, -1.000000e+00
%notlhs = fcmp ord double %6, 0.000000e+00
%notrhs = fcmp uno double %7, 0.000000e+00
%8 = or i1 %notrhs, %notlhs
br i1 %8, label %L22, label %if2
L5.L32_crit_edge: ; preds = %L22
%phitmp = fmul double %13, 4.000000e+00
br label %L32
L32: ; preds = %L5.L32_crit_edge, %
top
%res.0.lcssa = phi double [ %phitmp, %L5.L32_crit_edge ], [ 0.000000e+00, %top
]
ret double %res.0.lcssa
if2: ; preds = %if
call void @jl_throw(i8** inttoptr (i64 55938608 to i8**))
unreachable
L22: ; preds = %if
%9 = add i64 %"#temp#.05", 1
%10 = fmul double %5, 2.000000e+00
%11 = fadd double %10, 1.000000e+00
%12 = fdiv double %6, %11
%13 = fadd double %res.06, %12
%14 = icmp eq i64 %9, %3
br i1 %14, label %L5.L32_crit_edge, label %if
}
julia> @code_native(leibniz(100_000_000))
Filename: REPL[1]
pushq %rbp
movq %rsp, %rbp
Source line: 3
pushq %rsi
pushq %rdi
pushq %rbx
subq $104, %rsp
movapd %xmm9, -48(%rbp)
movapd %xmm8, -64(%rbp)
movapd %xmm7, -80(%rbp)
movapd %xmm6, -96(%rbp)
cmpq $-2, %rcx
movq $-1, %rsi
cmovgq %rcx, %rsi
incq %rsi
xorpd %xmm6, %xmm6
je L191
xorpd %xmm6, %xmm6
xorl %edi, %edi
movabsq $333321392, %rax # imm = 0x13DE14B0
Source line: 699
movsd (%rax), %xmm8 # xmm8 = mem[0],zero
movabsq $pow, %rbx
movabsq $333321400, %rax # imm = 0x13DE14B8
movsd (%rax), %xmm9 # xmm9 = mem[0],zero
nopl (%rax,%rax)
Source line: 701
L112:
xorps %xmm7, %xmm7
cvtsi2sdq %rdi, %xmm7
Source line: 699
movapd %xmm8, %xmm0
movapd %xmm7, %xmm1
callq *%rbx
movapd %xmm7, %xmm1
addsd %xmm8, %xmm1
Source line: 300
ucomisd %xmm1, %xmm1
jp L152
ucomisd %xmm0, %xmm0
jp L222
Source line: 3
L152:
incq %rdi
Source line: 4
addsd %xmm7, %xmm7
addsd %xmm9, %xmm7
divsd %xmm7, %xmm0
addsd %xmm0, %xmm6
Source line: 3
cmpq %rdi, %rsi
jne L112
movabsq $333321408, %rax # imm = 0x13DE14C0
mulsd (%rax), %xmm6
Source line: 6
L191:
movapd %xmm6, %xmm0
movaps -96(%rbp), %xmm6
movaps -80(%rbp), %xmm7
movaps -64(%rbp), %xmm8
movaps -48(%rbp), %xmm9
addq $104, %rsp
popq %rbx
popq %rdi
popq %rsi
popq %rbp
retq
Source line: 300
L222:
movabsq $jl_throw, %rax
movl $55938608, %ecx # imm = 0x3558E30
callq *%rax
ud2
nopw %cs:(%rax,%rax)
compiled without optimizations: 10.697 seconds
compiled with -d:release: 7.253 seconds
import times
proc leibniz(terms: int): float =
  ## Pi via the Leibniz series; alternates the sign with a parity test
  ## instead of calling pow(-1.0, n).
  var total = 0.0
  for n in 0 .. terms:
    let denom = 2.0 * float(n) + 1.0
    if n mod 2 == 0:
      total += 1.0 / denom
    else:
      total -= 1.0 / denom
  result = 4.0 * total
# Benchmark driver: time a 100-million-term run of the series.
let t0 = cpuTime()
echo(leibniz(100_000_000))
let t1 = cpuTime()
# cpuTime() measures CPU seconds, not wall-clock time.
echo "Elapsed time: ", $(t1 - t0)
compiled without optimizations: 4.787 seconds
compiled with -d:release: 0.384 seconds
compiled without optimizations: 1.811 seconds
compiled with -d:release: 0.381 seconds
@zolern, thank you. Your code rocks.
However, we are cheating Julia a bit, because its code still uses POW while ours uses MOD.
I will modify / rerun the .jl script just to check the result.
@wiffel: It is true: last updates of LLVM and Clang among other things declared 5x faster pow execution. Nim can do nothing about this.
@LeuGim: Yes, in this particular case POW is not the best choice, but it is somewhat worrying and unpleasant when your code depends on a library function that is unexpectedly poorly implemented.
@zolern:
My point was exactly that the library function is NOT to be considered badly implemented for not optimizing this case. Such optimizations come with a cost (additional runtime checks), and at the very least they cost their developers' effort/time, so not all possible optimizations can be done; library/compiler developers should choose the more probable and sane cases among all possible ones to optimize (say, -2 could also be optimized to instantly return 4, and -2.5 to instantly return -6.25, ..., but there is an infinity of numbers).
And this particular case (using POW for 1, -1, ...) is not of those worth both runtime checks and library developers effort. If the programmer considers efficiency (and readability too!) a little bit, why would he write it this way? So what point in optimizing it? GCC does better in this case.
Yet for such special cases special (if the programmer just likes writing this way) just-in-time optimizations can be made, like
# Compile-time specialization of pow for a static base: when the base is the
# literal -1.0, replace the libm pow call with a cheap parity lookup; any other
# static base falls through to math.pow.
# NOTE(review): assumes y is a non-negative whole number -- `y.int mod 2`
# would produce a negative index for y < 0; confirm callers never do that.
proc pow(x: static[float], y: float): float = (when x == -1.0: float([1,-1][y.int mod 2]) else: math.pow(x, y))
(yet faster with template), or with term rewriting, smth like
# Term-rewriting template: rewrites any `pow(-1.0, x)` call site into a
# parity lookup at compile time.
# Fix: the original body referenced an undeclared `y`; the pattern only binds
# `x`, so the body must use `x.int mod 2` -- this is why it "didn't work".
template optPowMinusOne{pow(-1.0, x)}(x: float): float = float([1,-1][x.int mod 2])
(this didn't work for me though — maybe someone can point out what's wrong with it; note that the body references `y`, which is never declared, where the bound parameter is `x`).
This is no longer Nim's problem, but just for fun, (short of importing intrinsics)
{.passc:"-march=native".}
import times, math
proc leibniz0(terms: int): float =
  ## Baseline Leibniz sum: alternating sign obtained via pow(-1, n),
  ## the slow libm-call variant. pi = 4 * sum (-1)^n / (2n + 1).
  var total = 0.0
  for k in 0 .. terms:
    let kf = float(k)
    total += pow(-1.0, kf) / (2.0 * kf + 1.0)
  result = 4.0 * total
proc leibniz1(terms: int): float =
  ## Leibniz sum with the sign chosen by the low bit of the index
  ## instead of calling pow.
  var total = 0.0
  for k in 0 .. terms:
    let denom = 2 * float(k) + 1
    total += (if (k and 1) == 0: 1.0 else: -1.0) / denom
  result = 4 * total
proc leibniz2[N: static[int]](terms: int): float =
  ## Blocked, vectorization-friendly Leibniz sum: accumulates the series into
  ## L*T independent partial sums so the C compiler can keep several
  ## accumulators in flight, then folds them and applies the alternating sign.
  const
    L = 1 shl N   # lane count (2^N): width of one accumulator group
    L2 = L shl 1
    T = 10        # number of lane groups
  var t: array[L*T, float]   # per-lane partial sums (positive-magnitude terms)
  let r = (terms shr N) div T
  # Only term counts that are an exact multiple of L*T are supported.
  if (terms mod (L*T)) != 0: quit 1
  # Seed with the final (terms-th) term, which the blocked loops below skip;
  # terms is a multiple of L*T, so that term's sign is positive.
  var res = 1/float(2*terms+1)
  for n in 0..<r:
    for j in 0..<T:
      let jl = j*L
      # nl = 2 * (index of the first term handled by this lane group)
      let nl = float(n)*float(L2)*float(T)+float(2*jl)
      for i in 0..<L: t[jl+i] += 1/(nl+float(2*i+1))
  # Fold the T lane groups down into the first group.
  for j in 1..<T:
    for i in 0..<L:
      t[i] += t[j*L+i]
  # Even lanes hold even-index (+) terms, odd lanes odd-index (-) terms.
  for n in 0..<L shr 1: res += t[2*n]-t[2*n+1]
  4*res
# GCC vector extensions: declare SIMD types directly in the generated C.
# vi = 4 x int32 (16 bytes), vd = 4 x float64 (32 bytes); `may_alias`
# suppresses strict-aliasing complaints for the pointer casts used below.
# Requires GCC/Clang (see the {.passc:"-march=native".} pragma above).
{.emit:"""
typedef int vi __attribute__ ((vector_size(16),may_alias));
typedef double vd __attribute__ ((vector_size(32),may_alias));
""".}
type
  vi {.importc.} = object   # opaque handle for the emitted 4 x int vector
  vd {.importc.} = object   # opaque handle for the emitted 4 x double vector

# Lane-wise SIMD helpers; each compiles to a single vector C statement.
proc `+=`(x:ptr vd,y:float) =
  ## Broadcast-add the scalar y to every lane of x.
  {.emit:"*`x` += `y`;".}
proc `+=`(x:ptr vd,y:ptr vd) =
  ## Lane-wise vector addition: x += y.
  {.emit:"*`x` += *`y`;".}
proc inv(y:ptr vd) =
  ## Lane-wise reciprocal: y = 1.0 / y.
  {.emit:"*`y` = 1.0 / *`y`;".}
proc leibniz3(terms: int): float =
  ## SIMD Leibniz sum using the emitted GCC vector type vd: computes four
  ## reciprocals per step, accumulating per-lane partial sums.
  const
    L = 4      # vector width: doubles per vd
    L2 = 2*L
  var
    t: array[L,float]   # per-lane running partial sums
    z: array[L,float]   # scratch: the current block's four denominators
  let
    # Reinterpret the arrays as the emitted SIMD vector type.
    tt = cast[ptr vd](addr t)
    zz = cast[ptr vd](addr z)
  let r = terms div L
  # Only term counts that are an exact multiple of L are supported.
  if (terms mod L) != 0: quit 1
  # Seed with the final (terms-th) term, which the blocked loop skips.
  var res = 1/float(2*terms+1)
  for n in 0..<r:
    let nl = float(n*L2)
    # Base denominators 1, 3, 5, 7 for this block...
    for i in 0..<L:
      z[i] = float(2*i+1)
    zz += nl   # ...shifted by the block offset 2*n*L
    inv zz     # lane-wise reciprocal: 1 / (2k + 1)
    tt += zz   # accumulate into the per-lane sums
  # Even lanes hold even-index (+) terms, odd lanes odd-index (-) terms.
  for n in 0..<L shr 1: res += t[2*n]-t[2*n+1]
  4*res
template r(f: untyped) =
  ## Benchmark helper: run `f` on one billion terms and echo its name,
  ## result, and elapsed CPU time.
  const termCount = 1_000_000_000
  let started = cpuTime()
  let value = f termCount
  let elapsed = cpuTime() - started
  echo asttostr(f), " ", value, " Elapsed time: ", elapsed
# Benchmark every variant; leibniz2 is instantiated for lane counts 2^1..2^8.
r leibniz0
r leibniz1
r leibniz2[1]
r leibniz2[2]
r leibniz2[3]
r leibniz2[4]
r leibniz2[5]
r leibniz2[6]
r leibniz2[7]
r leibniz2[8]
r leibniz3
Results completely depend on your C compiler and your CPU. Note that I've changed the number to be one billion.
What baffles me is how slow this seems to be for everyone. I get some .7 seconds (plus or minus some random noise) for both Nim and Julia, both with clang, gcc-5, and gcc-7. Julia is a tick slower (around .78 seconds for Julia vs. .72 seconds for Nim), but nothing that breaks even the single second barrier. That's on a 2.5 GHz Core i7, a mid-2014 Mac, so how hardly a particularly powerful system.
This is with the code copied and pasted from the first post in this thread and no modifications applied.
Are you on Mac OS X by chance? We're calling the C standard library function for pow, and its implementation there may be significantly faster.
Yes, but that wouldn't explain the faster Julia code (and if it did, it wouldn't explain the difference between Julia and Nim that others are observing).