I'm working on https://github.com/nim-lang/Nim/pull/20897, which replaces the dispatcher based on subtype checking with a vtable dispatcher. But in the benchmark https://gist.github.com/ringabout/0752a22b2c85e737a66314bb61a470e6, the two perform about the same.
The old dispatcher got a performance boost from https://github.com/nim-lang/Nim/pull/20781, which turned the subtype check into an O(1) operation. It also uses direct calls, while the vtable implementation uses indirect calls.
Are there other factors which affect the performance of the vtable implementation?
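For context, here is a minimal C sketch of the two dispatch shapes being compared (the names are invented; this is not the actual Nim codegen): the old dispatcher walks a tree of cheap subtype checks and then makes a direct call, while the vtable dispatcher loads a function pointer from the type header and makes one indirect call.

#include <stdio.h>

/* Hedged sketch with invented names, not the real Nim runtime types. */
typedef struct TypeHeader TypeHeader;
typedef struct Obj { const TypeHeader *hdr; } Obj;
typedef void (*Method)(Obj *self);

struct TypeHeader {
  int typeId;            /* stand-in for the O(1) subtype-check data */
  const Method *vtable;  /* per-type method table used by the new dispatcher */
};

static void barForA(Obj *x) { (void)x; puts("A.bar"); }
static void barForB(Obj *x) { (void)x; puts("B.bar"); }

static const Method vtableA[] = { barForA };
static const Method vtableB[] = { barForB };
static const TypeHeader headerA = { 1, vtableA };
static const TypeHeader headerB = { 2, vtableB };

/* Old style: a tree of cheap subtype checks followed by direct calls. */
static void bar_dispatchTree(Obj *x) {
  if (x->hdr->typeId == 1)      barForA(x);  /* direct, predictable call */
  else if (x->hdr->typeId == 2) barForB(x);
  /* ... one branch per implementing type ... */
}

/* New style: one table load and one indirect call, independent of the type count. */
static void bar_vtable(Obj *x) {
  x->hdr->vtable[0](x);
}

int main(void) {
  Obj a = { &headerA }, b = { &headerB };
  bar_dispatchTree(&a);  /* prints A.bar via a direct call */
  bar_vtable(&b);        /* prints B.bar via one indirect call */
  return 0;
}

When only a few types implement the method, the branches in the tree version are easy to predict and the direct calls can be inlined, which may be part of why the two approaches perform about the same in the gist benchmark; the tree grows with the number of implementing types, whereas the indirect call stays a single load and call.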
The C code of the vtable implementation:
static N_INLINE(void*, nimGetVTable)(void* p__0kHUBDcXPgMHMI8U9cXW5CQ, NI index__P0r4EXn3O3JcFkHGB9caIWg) {
  void* result;
  result = (void*)0;
  result = (*(*((TNimTypeV2**) (p__0kHUBDcXPgMHMI8U9cXW5CQ)))).vTable[index__P0r4EXn3O3JcFkHGB9caIWg];
  return result;
}
N_LIB_PRIVATE N_NIMCALL(void, bar__test_37)(tyObject_AcolonObjectType___FPPdkqOrYRgd8wXaW5PMrA* x__WXRew1WllMKXFDEhSGiRFw) {
  NIM_BOOL* nimErr_;
  {
    nimErr_ = nimErrorFlag();
    {
      if (!(x__WXRew1WllMKXFDEhSGiRFw == 0)) goto LA3_;
      chckNilDisp(x__WXRew1WllMKXFDEhSGiRFw);
    }
    goto LA1_;
    LA3_: ;
    {
      void* T6_;
      T6_ = (void*)0;
      T6_ = nimGetVTable(x__WXRew1WllMKXFDEhSGiRFw, 0);
      ((tyProc__k3wrRs2usSIRSHGJl0kTNQ) (T6_))(x__WXRew1WllMKXFDEhSGiRFw);
      if (NIM_UNLIKELY(*nimErr_)) goto BeforeRet_;
    }
    LA1_: ;
  }
  BeforeRet_: ;
}
The C code of the old implementation:
N_LIB_PRIVATE N_NIMCALL(void, bar__test_37)(tyObject_AcolonObjectType___FPPdkqOrYRgd8wXaW5PMrA* x__WXRew1WllMKXFDEhSGiRFw) {
  NIM_BOOL* nimErr_;
  {
    nimErr_ = nimErrorFlag();
    chckNilDisp(x__WXRew1WllMKXFDEhSGiRFw);
    {
      if (!((x__WXRew1WllMKXFDEhSGiRFw) && (isObjDisplayCheck((*x__WXRew1WllMKXFDEhSGiRFw).Sup.m_type, 17, 796740352)))) goto LA3_;
      if (x__WXRew1WllMKXFDEhSGiRFw && !isObjDisplayCheck((*x__WXRew1WllMKXFDEhSGiRFw).Sup.m_type, 17, 796740352)) {
        raiseObjectConversionError(); goto BeforeRet_;
      }
      bar__test_68(((tyObject_A16colonObjectType___Ci9a9aS6PLwJzUVRMIkKuZyw*) (x__WXRew1WllMKXFDEhSGiRFw)));
      if (NIM_UNLIKELY(*nimErr_)) goto BeforeRet_;
    }
    goto LA1_;
    LA3_: ;
    {
      if (!((x__WXRew1WllMKXFDEhSGiRFw) && (isObjDisplayCheck((*x__WXRew1WllMKXFDEhSGiRFw).Sup.m_type, 16, 2169393664)))) goto LA6_;
      if (x__WXRew1WllMKXFDEhSGiRFw && !isObjDisplayCheck((*x__WXRew1WllMKXFDEhSGiRFw).Sup.m_type, 16, 2169393664)) {
        raiseObjectConversionError(); goto BeforeRet_;
      }
      bar__test_66(((tyObject_A15colonObjectType___SIFOUqo7KT6mWGYo09clVRg*) (x__WXRew1WllMKXFDEhSGiRFw)));
      if (NIM_UNLIKELY(*nimErr_)) goto BeforeRet_;
    }
    goto LA1_;
    LA6_: ;
    {
      if (!((x__WXRew1WllMKXFDEhSGiRFw) && (isObjDisplayCheck((*x__WXRew1WllMKXFDEhSGiRFw).Sup.m_type, 15, 2733143808)))) goto LA9_;
      if (x__WXRew1WllMKXFDEhSGiRFw && !isObjDisplayCheck((*x__WXRew1WllMKXFDEhSGiRFw).Sup.m_type, 15, 2733143808)) {
        raiseObjectConversionError(); goto BeforeRet_;
      }
      bar__test_64(((tyObject_A14colonObjectType___VqLodxpzWtaIyT4TUFQvfg*) (x__WXRew1WllMKXFDEhSGiRFw)));
      if (NIM_UNLIKELY(*nimErr_)) goto BeforeRet_;
    }
It seems that the vtable implementation works great when there are 50 object types with 5 methods and 100 instances:
vtable: nim c -r -d:release
dummy value: 34803480
time: 0.005494600000000001
the old implementation:
dummy value: 34803480
time: 0.0237947
This is awesome!
Aside from the old dispatch trees being too slow, the main reason I can't use methods on embedded platforms is that the RTTI eats up too much RAM. If the RTTI and the vTable were generated as const in the C code, then this wouldn't be a problem.
Is that something that could be dealt with in this PR or would it have to be done separately?
The vtable might be stored in a global static array; see the WIP alternative PR => https://github.com/nim-lang/Nim/pull/21343
I'm not sure about the RTTI part.
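For readers wondering what "generated as const" would buy on embedded targets, here is a minimal C sketch (invented names; not the actual codegen of this PR or of PR 21343) of a per-type method table emitted as a const global. A const, statically initialized table can be placed in .rodata (typically flash), so it consumes no RAM and needs no startup code to fill it in.

#include <stdio.h>

typedef void (*Method)(void *self);

static void barForA(void *self) { (void)self; puts("A.bar"); }
static void fooForA(void *self) { (void)self; puts("A.foo"); }

/* const + static file scope: the table can live in read-only memory
   and requires no runtime initialization. */
static const Method vtableForA[] = { barForA, fooForA };

/* A type header that merely points at the const table (hypothetical layout). */
typedef struct TypeHeaderV2 {
  const Method *vTable;
  /* ... other RTTI fields would go here ... */
} TypeHeaderV2;

static const TypeHeaderV2 typeHeaderForA = { vtableForA };

int main(void) {
  typeHeaderForA.vTable[0](NULL);  /* indirect call through the const table */
  typeHeaderForA.vTable[1](NULL);
  return 0;
}

Whether the rest of the RTTI can be emitted the same way is the open question mentioned above.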