nim
import std/strutils
import std/sequtils
type
# NOTE(review): NEP1 convention is PascalCase for type names (TkName, Token);
# kept as-is here because later code references these spellings.
tkname = enum
# def = no token in progress; id = identifier; str = string literal;
# num = number; kw = keyword; uk = unknown (operator being accumulated);
# op = finished operator
def, id, str, num, kw, uk, op
# A single lexed token: its classification plus the raw text.
token = object
label: tkName
value: string
proc reset(tk: var token) =
  ## Return `tk` to its zero state (label `def`, empty value) so it can
  ## accumulate the next token.
  ##
  ## Fix: this was declared `method ... {.base.}`, but `token` is a value
  ## object with no inheritance, so there is no dynamic dispatch to be had —
  ## a plain `proc` expresses the same call with less machinery.
  tk.label = def
  tk.value = ""
func toStr(tk: token): string =
  ## Return the token's text.
  ##
  ## Fix: the original was a `method {.base.}` (no dispatch needed for a
  ## value object) that copied `tk.value` character by character into a new
  ## string. Nim strings have value semantics, so returning the field
  ## directly hands the caller an equivalent copy.
  tk.value
proc newTk(): token =
  ## Produce a default-initialized token (label `def`, empty value).
  ## `result` is already zero-initialized, so nothing needs to be set.
  discard
proc ignore*[T](value: T) =
  ## Explicitly drop `value`. Useful for evaluating an expression purely
  ## for its side effects (e.g. forcing an index access) without keeping
  ## the result.
  discard value
proc validIndex*[arrsize, arrtype](arr: array[arrsize, arrtype], index: int): bool =
  ## True when `index` addresses an element of `arr`.
  ##
  ## Fix: the original probed `arr[index]` and caught `IndexDefect`.
  ## Defects are not reliably catchable — with `--panics:on` they abort the
  ## process outright — and exceptions are the wrong tool for ordinary
  ## control flow. A plain bounds comparison is both correct and cheap.
  result = index >= ord(arr.low) and index <= ord(arr.high)
# Character/string tables driving the hand-rolled two-stage lexer below.
const keywords = ["let", "const", "var", "if", "else"]
# NOTE(review): decKeywords is never read anywhere in this snippet — dead constant?
const decKeywords: array[3, string] = ["let", "const", "var"]
const operators = ["=", "+", "-", "/", "*", "^", "==", "++", "--"]
const operatorStart = ['=','+','-','/', '*', '^']
# Entire input is slurped up front; the path is hard-coded.
let compilationTarget = readFile("./ex/example.txt")
var
tokens: seq[token]
currentToken: token
tkIdx: Natural = 0
currentToken.label = def
echo compilationTarget
# Main scan. Stage 1: when no token is in progress (label == def), classify
# the character and start a token. Stage 2: otherwise extend or finish the
# token in progress according to its label.
for ch in compilationTarget:
if currentToken.label == def:
case ch
of Whitespace:
continue
of '\"':
# Opening quote starts a string token; the quote itself is not stored.
currentToken.label = str
continue
of Digits:
currentToken.label = num
currentToken.value = $ch
of IdentStartChars:
currentToken.label = id
currentToken.value = $ch
of operatorStart:
# Operator characters start as "unknown" until a terminator confirms them.
currentToken.label = uk
currentToken.value = $ch
else:
continue
continue
case currentToken.label
of def:
break
of str:
if ch == '\"':
echo int tkIdx
# NOTE(review): `tokens = tokens & throwaway` copies the whole seq on
# every token; `tokens.add(currentToken)` would do the same job in O(1).
let throwaway = @[currentToken]
tokens = tokens & throwaway
currentToken.reset()
tkIdx += 1
else:
currentToken.value.add(ch)
of id:
# NOTE(review): this keyword check fires on every char while an identifier
# is being built, so an identifier starting with a keyword (e.g. "letter")
# is emitted early as the keyword "let" — likely not intended.
if keywords.contains(currentToken.value):
currentToken.label = kw
echo int tkIdx
let throwaway = @[currentToken]
tokens = tokens & throwaway
currentToken.reset()
tkIdx += 1
if Whitespace.contains(ch):
echo int tkIdx
let throwaway = @[currentToken]
tokens = tokens & throwaway
currentToken.reset()
tkIdx += 1
else:
currentToken.value.add(ch)
of num:
if Whitespace.contains(ch):
echo int tkIdx
let throwaway = @[currentToken]
tokens = tokens & throwaway
currentToken.reset()
tkIdx += 1
else:
currentToken.value.add(ch)
of uk:
# NOTE(review): when an operator is terminated by an identifier-start
# character, that character is consumed here and never starts the next
# token — its first letter is silently dropped. Also, if the accumulated
# text is not in `operators`, the token stays `uk` and is never flushed,
# which appears to be why the loop "stops here".
if Whitespace.contains(ch) or IdentStartChars.contains(ch):
if operators.contains(currentToken.value):
# Stops here.
echo int tkIdx
currentToken.label = op
tokens.add(currentToken)
currentToken.value = ""
currentToken.label = def
tkIdx += 1
else:
currentToken.value.add(ch)
of kw:
currentToken.label = def
of op:
currentToken.label = def
else:
continue
# Dump every completed token.
for cTk in tokens:
if cTk.label != def:
echo cTk
I tend to do something along these lines:
import std/[strutils,parseutils]
type
TokKind = enum
def, id, str, num, kw, uk, op
Token = object
label: TokKind
value: string
const
keywords = ["let", "const", "var", "if", "else"]
# NOTE(review): defKeywords is unused in this sketch.
defKeywords: array[3, string] = ["let", "const", "var"]
operators = ["=", "+", "-", "/", "*", "^", "==", "++", "--"]
operatorStart = {'=','+','-','/','*','^',}
let compilationTarget = readFile("/tmp/lex/example.txt")
var tokens: seq[Token]
echo compilationTarget
# Cursor into the input; parseutils procs advance it by the amount consumed.
var idx = compilationTarget.skipWhitespace()
while idx < compilationTarget.len:
# Nested helpers close over `idx` and `compilationTarget`; each builds one
# Token and advances the cursor past the consumed characters.
proc parseUntil(label: TokKind, until: set[char]|char):Token =
result.label = label
idx += compilationTarget.parseUntil(result.value, until, start = idx)
proc parseWhile(label: TokKind, validChars: set[char]):Token =
result.label = label
idx += compilationTarget.parseWhile(result.value, validChars, start = idx)
# Dispatch on the current character; the chosen branch yields the Token
# that `tokens.add` receives (case-as-expression).
tokens.add case compilationTarget[idx]
of '\"':
# NOTE(review): the opening quote is not skipped before parseUntil, so the
# scan starts ON the quote and appears to consume nothing — confirm this
# doesn't stall on string input (the author flags the sketch as rough).
str.parseUntil('\"')
of Digits:
num.parseWhile(Digits)
#plus {'.','e'} maybe, or parseuntil whitespace, sure why not
#i guess you're tokenizing `-1` as two tokens
of operatorStart:
op.parseWhile(operatorStart)
# ok this isn't quite right but you get the idea
of IdentStartChars:
# Identifiers are promoted to keywords after the fact.
var tmp = id.parseWhile(IdentChars)
if tmp.value in keywords:
tmp.label = kw
tmp
else:#unknown? is that what 'uk' means?
let tmp = Token(label: uk, value: $compilationTarget[idx])
inc idx
tmp
# Skip inter-token whitespace before the next round.
idx += compilationTarget.skipWhitespace(idx)
for cTk in tokens:
echo cTk
Check std/lexbase.
Or even better, use my toktok package, a generic tokenizer based on Nim macros, lexbase and strutils.
Basic example:
import toktok
# toktok generates the whole lexer at compile time from the DSL below.
static:
Program.settings(uppercase = true, prefix = "TK_") # limitation: must start with TK_
tokens:
# `Name > matcher` maps a token kind to a character or string.
Plus > '+'
Minus > '-'
Divide > '/':
# Nested `?` entries presumably define comment forms that begin with the
# parent char and run to a terminator (EOL or closing sequence) — see the
# toktok docs for the exact semantics.
Block_Comment ? '*' .. "*/"
Inline_Comment ? '/' .. EOL
Assign > '='
Alt_Comment > '#' .. EOL
Let > "let"
Const > "const"
# A set of spellings all mapping to one token kind.
True > {"TRUE", "True", "true"}
False > {"FALSE", "False", "false"}
# sample.txt
let user = "john"
Use getToken() to parse each token. This proc returns a TokenTuple:
(kind: TK_LET, value: "let", wsno: 0, line: 1, col: 0, pos: 0)
(kind: TK_IDENTIFIER, value: "user", wsno: 1, line: 1, col: 4, pos: 4)
(kind: TK_ASSIGN, value: "", wsno: 0, line: 1, col: 9, pos: 9)
(kind: TK_STRING, value: "john", wsno: 1, line: 1, col: 11, pos: 11)
Full example of toktok in action, implementing a basic parser and AST handler https://github.com/openpeep/toktok/blob/main/examples/program.nim
Hi, I have a quick question about BaseLexer open method in std/lexbase, that given your experience on the topic you may have an answer for.
It seems that the lexer is optimized for line based parsing? What is the best idiomatic way to parse a language that can ignore new lines? Currently for testing purpose my workaround is to do something like
lexbase.open(self, input, 999999)
where 999999 is an arbitrarily large number for the bufLen in
proc open(L: var BaseLexer; input: Stream; bufLen: int = 8192;
refillChars: set[char] = NewLines) {.....}
but this is quite unreliable and probably not very efficient.
Why ignore whitespace/newlines? You need this kind of info. What you want is to skip tokenizing the char itself while still storing the whitespace count and line number in a tuple for each getToken call:
TokenTuple* = tuple[kind: TokenKind, value: string, wsno, line, column: int]
Here is an example using only std libraries. You'll need a lexer, a parser and ast representation. Now let's make this shine!
# lexer.nim
import std/[lexbase, streams]
from std/strutils import Whitespace, `%`
type
# One entry per lexical shape the toy language recognises.
TokenKind* = enum
tkUnknown
tkIdent
tkColon
tkSemicolon
tkLet
tkConst
tkBool
tkString
tkAssign
tkEof
# What getToken returns: kind, raw text, preceding-whitespace count,
# and the source position.
TokenTuple* = tuple[kind: TokenKind, value: string, wsno, line, column: int]
# Extends std/lexbase's BaseLexer with per-token scratch state.
Lexer* = object of BaseLexer
kind: TokenKind
token: string
startPos, wsno: int
# NOTE(review): multiLineStr is read in handleString but never set in
# this snippet — presumably toggled by callers; confirm.
multiLineStr: bool
error: string
proc setError*(lex: var Lexer; message: string) =
  ## Record `message` as the lexer's error (first error wins) and downgrade
  ## the current token to tkUnknown.
  lex.kind = tkUnknown
  if lex.error.len == 0:
    lex.error = message

proc hasError*(lex: Lexer): bool {.inline.} =
  ## True once an error has been recorded.
  lex.error.len > 0

proc getError*(lex: Lexer): string {.inline.} =
  ## The recorded error formatted as "(line:column) message".
  "($1:$2) $3" % [$lex.lineNumber, $(lex.startPos), lex.error]
# Forward declarations so getToken can dispatch to these before their
# bodies appear below.
proc handleIdentifier(lex: var Lexer)
proc handleSpecial(lex: var Lexer): char
proc handleString(lex: var Lexer)
proc handleNewLine(lex: var Lexer) =
## Delegate CR/LF to BaseLexer so its buffer refill and lineNumber
## bookkeeping stay in sync.
case lex.buf[lex.bufpos]
of '\c': lex.bufpos = lex.handleCR(lex.bufpos)
of '\n': lex.bufpos = lex.handleLF(lex.bufpos)
else: discard
proc skip(lex: var Lexer) =
## Advance past whitespace, counting non-newline whitespace chars into
## lex.wsno (newlines are handled separately so line numbers stay right).
var wsno: int
while true:
case lex.buf[lex.bufpos]
of Whitespace:
if lex.buf[lex.bufpos] in NewLines:
lex.handleNewLine()
else:
inc lex.bufpos
inc wsno
else:
lex.wsno = wsno
return
proc skip(lex: var Lexer, keepLastWhitespace: bool) =
## Like skip(), but stops ON the final whitespace char when the next char
## is non-whitespace — used by multi-line strings to keep one separator.
var wsno: int
while true:
case lex.buf[lex.bufpos]
of Whitespace:
if lex.buf[lex.bufpos] in NewLines:
lex.handleNewLine()
else:
# Peek one char ahead; stop before consuming the last whitespace.
# NOTE(review): relying on IndexDefect for the end-of-buffer probe is
# fragile — defects abort outright under --panics:on; a bounds check
# against lex.buf.len would be safer.
try:
if lex.buf[lex.bufpos + 1] notin Whitespace and
lex.buf[lex.bufpos + 1] notin NewLines:
lex.wsno = wsno
break
except IndexDefect: discard
inc lex.bufpos
inc wsno
else:
lex.wsno = wsno
break
proc setToken(lex: var Lexer, kind: TokenKind, offset = 1) =
## Record the token kind and column, then advance the cursor `offset`
## chars past the token text.
lex.startPos = lex.getColNumber(lex.bufpos)
lex.kind = kind
inc lex.bufpos, offset
proc getToken*(lex: var Lexer): TokenTuple =
## Produce the next token: skip (and count) whitespace, dispatch on the
## current char, and package the result as a TokenTuple. Punctuation
## tokens carry an empty `value`; idents/strings fill lex.token.
lex.startPos = lex.getColNumber(lex.bufpos)
setLen(lex.token, 0) # reset token value
skip lex
case lex.buf[lex.bufpos]:
of '=':
lex.setToken(tkAssign)
of ';':
lex.setToken(tkSemicolon)
of ':':
lex.setToken(tkColon)
of '"':
lex.handleString()
of {'a' .. 'z', 'A' .. 'Z', '_'}:
lex.handleIdentifier()
of EndOfFile:
lex.startPos = lex.getColNumber(lex.bufpos)
lex.kind = tkEof
else:
lex.setToken(tkUnknown)
# The tuple below is the proc's result (last expression).
(
kind: lex.kind,
value: lex.token,
wsno: lex.wsno,
line: lex.lineNumber,
column: lex.startPos
)
proc handleIdentifier(lex: var Lexer) =
## Accumulate an identifier, then classify it as a keyword/bool/ident.
lex.startPos = lex.getColNumber(lex.bufpos)
setLen(lex.token, 0)
while true:
if lex.buf[lex.bufpos] in {'a'..'z', 'A'..'Z', '_'}:
add lex.token, lex.buf[lex.bufpos]
inc lex.bufpos
else:
# NOTE(review): the dec here is undone by setToken's `inc` below, so the
# cursor nets out unchanged; setToken also overwrites startPos with the
# column AFTER the identifier — which matches the column values in the
# sample token dump further down the thread. Confirm that's intended.
dec lex.bufpos
break
case lex.token:
of "let":
lex.setToken(tkLet)
of "const":
lex.setToken(tkConst)
of "true", "false":
lex.setToken(tkBool)
else:
lex.setToken(tkIdent)
proc handleSpecial(lex: var Lexer): char =
## Decode a backslash escape inside a string. Only \n and \\ are
## supported; anything else records an error and returns '\0'.
## The escaped spelling is appended to lex.token; the decoded char is
## returned (callers here discard it).
inc lex.bufpos
case lex.buf[lex.bufpos]
of 'n':
lex.token.add "\\n"
result = '\n'
inc lex.bufpos
of '\\':
lex.token.add "\\\\"
result = '\\'
inc lex.bufpos
else:
lex.setError("Unknown escape sequence: '\\" & lex.buf[lex.bufpos] & "'")
result = '\0'
proc handleString(lex: var Lexer) =
## Scan a double-quoted string into lex.token (quotes excluded).
## Newlines inside the literal are an error unless multiLineStr is set,
## in which case surrounding whitespace is collapsed and the reported
## line number is pinned to the opening quote's line.
lex.startPos = lex.getColNumber(lex.bufpos)
setLen(lex.token, 0)
let lineno = lex.lineNumber
inc lex.bufpos
while true:
case lex.buf[lex.bufpos]
of '\\':
discard lex.handleSpecial()
if lex.hasError(): return
of '"':
lex.kind = tkString
inc lex.bufpos
break
of NewLines:
if lex.multiLineStr:
lex.skip(keepLastWhitespace = true)
else:
lex.setError("EOL reached before end of string")
return
of EndOfFile:
lex.setError("EOF reached before end of string")
return
else:
add lex.token, lex.buf[lex.bufpos]
inc lex.bufpos
if lex.multiLineStr:
# Report the string at the line where it started.
lex.lineNumber = lineno
# ast.nim
from std/strutils import parseBool, parseInt
when not defined release:
import std/[json, jsonutils]
type
NodeType = enum
NTLet
NTConst
NTFunction
NTStmtList
NTBool
NTString
# Object variant: exactly one branch of fields is live, selected by
# nodeType; accessing the wrong branch raises FieldDefect.
Node* = ref object
case nodeType: NodeType
of NTBool:
bVal: bool
of NTString:
sVal: string
of NTLet:
letIdent: string
letValue: Node
of NTConst:
constIdent: string
constValue: Node
# NOTE(review): NTFunction/NTStmtList are declared but no constructor or
# parser rule produces them in this snippet.
of NTFunction:
fnIdent: string
fnParams: seq[Node]
fnBody: Node # NTStmtList
of NTStmtList:
stmtList: seq[Node]
# The parse result: a flat list of top-level nodes.
Program* = object
nodes: seq[Node]
proc `&=`*(program: var Program, node: Node) =
  ## Append `node` to the program's top-level node list.
  program.nodes.add(node)

proc newBool*(value: string): Node =
  ## Boolean literal node; `value` must be parseable by `parseBool`.
  result = Node(nodeType: NTBool, bVal: value.parseBool)

proc newString*(value: string): Node =
  ## String literal node.
  result = Node(nodeType: NTString, sVal: value)

proc newLet*(ident: string, value: Node): Node =
  ## `let` declaration node binding `ident` to `value`.
  result = Node(nodeType: NTLet, letIdent: ident, letValue: value)

proc newConst*(ident: string, value: Node): Node =
  ## `const` declaration node binding `ident` to `value`.
  result = Node(nodeType: NTConst, constIdent: ident, constValue: value)
when not defined release:
# Debug-only pretty printer: dumps the whole AST as indented JSON.
proc `$`*(node: Program, indent = 2): string =
pretty toJson(node), indent
# parser.nim
import ./ast
export ast
include ./lexer
type
Parser = object
lex*: Lexer
error: string
# Three-token lookahead window maintained by walk().
prev*, current*, next*: TokenTuple
program: Program
# A prefix-parse callback chosen per leading token kind.
PrefixFunction = proc(p: var Parser): Node
proc setError(p: var Parser, msg: string) =
## Record a parse error, tagged with the current token's position.
p.error = "Error ($2:$3): $1" % [msg, $p.current.line, $p.current.column]
proc hasError*(p: var Parser): bool {.inline.} =
p.error.len != 0
proc getError*(p: var Parser): string =
## NOTE(review): this prefixes position info a second time on top of what
## setError already embedded — the message comes out doubled; confirm.
result = "($1:$2) $3" % [$p.current.line, $p.current.column, p.error]
proc walk(p: var Parser, offset = 1) =
  ## Slide the lookahead window forward `offset` steps:
  ## prev <- current <- next <- fresh token from the lexer.
  var steps = 0
  while steps != offset:
    p.prev = p.current
    p.current = p.next
    p.next = p.lex.getToken()
    inc steps
proc parseVarDefinition(p: var Parser): Node =
## Parse `let|const <ident> = <bool|string>;` starting at the let/const
## token. Returns nil (with p.error set) on any malformed part.
let this = p.current
walk p
let identName = p.current.value
if p.next.kind != tkAssign:
p.setError("Missing assignment")
return
# Step over ident and '=' onto the value token.
walk p, 2
var identValue: Node = case p.current.kind:
of tkString: newString(p.current.value)
of tkBool: newBool(p.current.value)
else:
# NOTE(review): on a bad value identValue stays nil but a node is still
# constructed below; harmless only because parseProgram halts on hasError.
p.setError("Invalid value assignment")
nil
if this.kind == tkLet:
result = newLet(identName, identValue)
elif this.kind == tkConst:
result = newConst(identName, identValue)
if p.next.kind != tkSemicolon:
p.setError("Missing semicolon")
return
# Consume the ';' and land on the first token of the next statement.
walk p, 2
proc unexpected(p: var Parser): Node =
## Fallback prefix handler: record an error naming the stray token
## (by value when it has one, by kind otherwise) and return nil.
p.setError("Unexpected token \"$1\"" % [
if p.current.value.len != 0:
p.current.value
else:
$p.current.kind
])
proc getPrefixFn(p: var Parser, kind: TokenKind): PrefixFunction =
## Choose the parse callback for a statement's leading token kind.
result = case kind:
of tkLet, tkConst:
parseVarDefinition
else: unexpected
proc parseExp(p: var Parser): Node =
## Parse one expression/statement via its prefix handler; nil on error.
let this = p.current
let callPrefixFn = p.getPrefixFn(this.kind)
let exp: Node = p.callPrefixFn()
if exp != nil:
result = exp
proc parseProgram*(p: var Parser) =
## Drive the parse loop until EOF or the first lexer/parser error,
## appending each produced node to the program.
while p.current.kind != tkEof and (p.hasError == false and p.lex.hasError == false):
let node = p.parseExp()
if node != nil:
p.program &= node
proc getAst*(p: var Parser): Program =
result = p.program
proc close*(p: var Parser) =
## Release the lexer's underlying stream.
p.lex.close()
proc newParser*(fileContents: string): Parser =
## Build a parser over `fileContents` and prime the two-token lookahead
## (current and next) so parseProgram can start immediately.
result = Parser(
lex: Lexer(
kind: tkUnknown,
startPos: 0,
token: "",
error: ""
),
program: Program()
)
result.lex.open(newStringStream(fileContents))
result.current = result.lex.getToken()
result.next = result.lex.getToken()
Main file.
import ./parser
when isMainModule:
# Parse sample.txt, then report the first lexer or parser error, or (in
# non-release builds) dump the AST as JSON.
var p = newParser(readFile("sample.txt"))
p.parseProgram()
if p.lex.hasError:
echo p.lex.getError()
elif p.hasError:
echo p.getError()
else:
when not defined release:
echo p.getAst()
else: discard
p.close()
your sample.txt
let a = true; let b = "who"; const c = false;
Tokens:
(kind: tkLet, value: "let", wsno: 0, line: 1, column: 2)
(kind: tkIdent, value: "a", wsno: 1, line: 1, column: 4)
(kind: tkAssign, value: "", wsno: 1, line: 1, column: 6)
(kind: tkBool, value: "true", wsno: 1, line: 1, column: 11)
(kind: tkSemicolon, value: "", wsno: 0, line: 1, column: 12)
(kind: tkLet, value: "let", wsno: 1, line: 1, column: 16)
(kind: tkIdent, value: "b", wsno: 1, line: 1, column: 18)
(kind: tkAssign, value: "", wsno: 1, line: 1, column: 20)
(kind: tkString, value: "who", wsno: 1, line: 1, column: 22)
(kind: tkSemicolon, value: "", wsno: 0, line: 1, column: 27)
(kind: tkConst, value: "const", wsno: 1, line: 1, column: 33)
(kind: tkIdent, value: "c", wsno: 1, line: 1, column: 35)
(kind: tkAssign, value: "", wsno: 1, line: 1, column: 37)
(kind: tkBool, value: "false", wsno: 1, line: 1, column: 43)
(kind: tkSemicolon, value: "", wsno: 0, line: 1, column: 44)
AST Output:
{
"nodes": [
{
"nodeType": 0,
"letIdent": "a",
"letValue": {
"nodeType": 4,
"bVal": true
}
},
{
"nodeType": 0,
"letIdent": "b",
"letValue": {
"nodeType": 5,
"sVal": "who"
}
},
{
"nodeType": 1,
"constIdent": "c",
"constValue": {
"nodeType": 4,
"bVal": false
}
}
]
}
Rock n roll!
Thanks for the extensive example, I will have a look in detail.
Maybe, though, my problem was not too clear. I have no problem in ignoring the new lines or white spaces. My issue is that when using the open method of BaseLexer with the default BufLen and I try to parse a file where all the code is in a single line longer than the BufLen the program crashes.
I can see in your example you are also using result.lex.open(newStringStream(fileContents)) with default BufLen, will it work if your test file has more than 8192 characters in one line?
Ah, that makes sense now. What about len(fileContents)?
Anyway, for really big files there is std/memfiles
may have to dig deeper in the BaseLexer source, but I think the problem is not about reading a large file, but the fact that the lexer buffers one line at the time and has this BufLen limit instead of having something like a moving window or similar. My assumption is that this works well for line based languages (e.g. nim itself), but in a language where going to a new line is optional it will fail.
setting the buffer length to len(fileContents) is a quick solution, but I fear may not be very efficient (e.g. extremely large file with normal length lines), hence my questions if there is a different more idiomatic way to solve this problem. It may be that the base lexer is indeed optimized for line based languages and may need a completely different tool, or maybe there is some option I am not aware of to bypass this limitation, seems anyway an interesting use case.
lexbase is not "line based", use the refillChars optional parameter:
proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192;
refillChars: set[char] = NewLines)
You need to call handleRefillChar for every char that is in your refillChars set. But using a memory mapped file instead should be way easier and faster too. But lexbase works on embedded devices without an MMU.
I've made a new lexer, since I had a hard time making a parser. Here it is:
import std/strutils
import std/os
# Input file comes from the first command-line argument.
let file = readFile(paramStr(1))
type
tokenName = enum
# "seperator" [sic] — kept to match the references below.
default, unknown, ident, str, integer, double, keyword, operator, seperator
token = object
label: tokenName
value: string
abstNode = object
head: token
body: seq[abst]
# NOTE(review): abst is a ref type — a `var x: abst` starts out nil and
# must be allocated (new/construction) before any field access. This is
# the cause of the SIGSEGV discussed at the end of the thread.
abst = ref abstNode
# Grammar predicate table: the parser matches tokens against these named
# closures (looked up by name via equivParamValues.find).
let equivParams = [
(name: "none", equiv_fn: proc (x: token): bool =
return [default, unknown].contains(x.label)
),
(name: "identifier", equiv_fn: proc (x: token): bool =
return x.label == ident
),
(name: "type", equiv_fn: proc (x: token): bool =
return ["int", "char", "string"].contains(x.value)
),
(name: "=", equiv_fn: proc (x: token): bool =
return x.label == operator and x.value == "="
),
(name: ":", equiv_fn: proc (x: token): bool =
return x.label == seperator and x.value == ":"
)
]
# Parallel list of predicate names, so a name maps to an index above.
var equivParamValues: seq[string]
for v in equivParams:
equivParamValues.add(v.name)
# Convenience constructor for a token value.
proc tkn(label: tokenName, value: string): token =
var t: token
t.label = label
t.value = value
return t
proc newAst(ast_tk: token): abst =
  ## Allocate a new AST node whose head is `ast_tk` (empty body).
  ##
  ## Fix: `abst` is a ref type, so the original's `var nw: abst` was nil
  ## and the `nw.head = ...` field write dereferenced nil — the SIGSEGV
  ## reported later in the thread. Constructing the object allocates it.
  result = abst(head: ast_tk)
# Global lexer state: the token being built and the completed stream.
var currentTk: token
var tokens: seq[token]
const
seperators = ['(','[', '{', '}', ']', ')', ':', ';',]
# NOTE(review): head_seperators/end_seperators are unused in this snippet.
head_seperators = ['(','[', '{']
end_seperators = [')', ']', '}']
operators = ["=", "+", "-", "/", "*", "^", "==", "++", "--", "<", ">"]
operatorStart = ['=','+','-','/', '*', '^', '<', '>']
# NOTE(review): never mutated — could be const.
var keywordStrings = ["let","var", "const", "fn", "return"]
proc reset(tk: var token) =
  ## Clear the scratch token back to its empty state so the next
  ## character can start a fresh token.
  tk.value = ""
  tk.label = default
proc unknownType(ch: char) =
## State handler for "no token in progress": classify `ch` and start
## (or, for one-char tokens, immediately emit) the next token.
case ch:
of '\"':
currentTk.label = str
of Digits:
currentTk.label = integer
currentTk.value = $ch
# NOTE(review): the single digit is pushed to `tokens` right away; numType
# later deletes the last token on some termination paths, apparently to
# replace this eager entry — confirm the pairing is consistent.
tokens.add(currentTk)
of IdentStartChars:
currentTk.label = ident
currentTk.value = $ch
of Whitespace:
return
of seperators:
# Separators are always one char: emit and reset immediately.
currentTk.label = seperator
currentTk.value = $ch
tokens.add(currentTk)
currentTk.reset()
of operatorStart:
currentTk.label = operator
currentTk.value = $ch
else:
return
proc identType(ch: char) =
## State handler while inside an identifier: extend it, or flush it when
## a terminator (operator char, separator, whitespace) arrives.
case ch
of operatorStart:
# NOTE(review): unlike the Whitespace branch, this flush skips the
# keyword promotion, so e.g. "let=" would emit "let" as a plain ident.
tokens.add(currentTk)
currentTk.reset()
currentTk.label = operator
currentTk.value = $ch
return
of seperators:
# Flush the identifier, then emit the one-char separator too.
tokens.add(currentTk)
currentTk.reset()
currentTk.label = seperator
currentTk.value = $ch
tokens.add(currentTk)
currentTk.reset()
return
of Whitespace:
# Promote to keyword only when the whole word matches.
if keywordStrings.contains(currentTk.value):
currentTk.label = keyword
tokens.add(currentTk)
currentTk.reset()
return
of IdentChars:
currentTk.value.add(ch)
return
else:
return
proc strType(ch: char) =
  ## State handler inside a string literal: a closing quote finishes the
  ## token; any other character is accumulated verbatim.
  case ch
  of '\"':
    tokens.add(currentTk)
    currentTk.reset()
  else:
    currentTk.value.add(ch)
proc numType(ch: char) =
## State handler inside a number: extend with digits, switch to `double`
## on '.', or flush when a terminator arrives.
case ch
of Digits:
currentTk.value.add(ch)
of '.':
currentTk.value.add(ch)
currentTk.label = double
of Whitespace:
# NOTE(review): unlike the operator/separator branches below, this path
# does NOT delete the eagerly-added single-digit token from unknownType,
# which looks like it would leave a duplicate — confirm.
tokens.add(currentTk)
currentTk.reset()
of operatorStart:
# Replace the eagerly-added first-digit token with the full number.
tokens.delete(tokens.len() - 1)
tokens.add(currentTk)
currentTk.reset()
currentTk.label = operator
currentTk.value = $ch
return
of seperators:
tokens.delete(tokens.len() - 1)
tokens.add(currentTk)
currentTk.reset()
currentTk.label = seperator
currentTk.value = $ch
tokens.add(currentTk)
currentTk.reset()
of IdentStartChars:
# NOTE(review): no reset() before relabelling — the digits already
# accumulated stay in value; also no delete of the eager entry. Confirm.
tokens.add(currentTk)
currentTk.label = ident
currentTk.value = $ch
else:
discard
proc opType(ch: char) =
## State handler inside an operator.
## NOTE(review): the logic here looks inverted — the first branch flushes
## the CURRENT token as soon as the next char is itself a one-char operator
## (so "==" would be split), and the whitespace branch then re-checks a
## token that was just reset. Verify against intended multi-char operators.
if operators.contains($ch):
tokens.add(currentTk)
currentTk.reset()
if Whitespace.contains(ch):
if operators.contains(currentTk.value):
tokens.add(currentTk)
currentTk.reset()
else:
currentTk.value.add(ch)
# Driver loop: dispatch each character to the handler for the current
# lexer state (the label of the token being built).
for ch in file:
if currentTk.label == default:
unknownType(ch)
else:
case currentTk.label
of default:
# Unreachable: the branch above already handled default.
break
of str:
strType(ch)
of ident:
identType(ch)
of integer, double:
numType(ch)
of operator:
opType(ch)
else:
continue
proc echoTks(tkns: seq[token]) =
  ## Print every token in `tkns`, one per line.
  ##
  ## Fix: the original iterated the global `tokens` and ignored its `tkns`
  ## parameter entirely — it only worked by coincidence when called with
  ## the global. Iterate the parameter instead.
  for t in tkns:
    echo t
# Shared cursor into the token stream; both the top-level call and the
# recursive sub-rule calls advance it.
var idx: int = 0

proc parseTkns(tkns: var seq[token], params: seq[string]): abst =
  ## Build an AST from `tkns`. With empty `params` this is the top-level
  ## rule (scans for let/var/const declarations); otherwise `params` names
  ## the predicate sequence (see equivParams) the next tokens must match.
  ##
  ## Fix: `abst` is a ref type, so the original's `var ast: abst` was nil
  ## and the first `ast.body.add(...)` dereferenced nil — the reported
  ## "SIGSEGV: Illegal storage access". Allocate it before use.
  ## NOTE(review): the recursion shares the global `idx` and indexes
  ## `par[idx]` with that same stream cursor — the grammar walk likely
  ## needs its own local index; only the crash is fixed here.
  var ast = new abst
  if params.len() == 0:
    while(true):
      if tkns[idx].value in ["let", "var", "const"]:
        ast.body.add(parseTkns(tkns, @["identifier", ":", "type"]))
      idx += 1
      if idx == tkns.len():
        break
  else:
    var par: seq[string] = params
    while(par.len() > idx):
      echo par[idx]
      if equivParams[equivParamValues.find(par[idx])].equiv_fn(tkns[idx]):
        ast.body.add(newAst(tkns[idx]))
      idx += 1
  return ast

var ast = parseTkns(tokens, @[])
, And when I run it, it tells me that:
Traceback (most recent call last): src/main.nim(213) main; src/main.nim(200) parseTkns — SIGSEGV: Illegal storage access. (Attempt to read from nil?)
Does anyone know how to fix this, or another way to do this? I'm sure it has to do with the recursion on the lines.
it's because abst (please begin types with a capital letter btw) is a ref object, and you're dereferencing it when it is still nil.
you need to
var ast = new abst