nimforum mirror - Trying to make a lexer, stops if it hits an operator.

BombManYeeted (orginal) [2023-01-07T03:44:59+01:00] view original

 nim
import std/strutils
import std/sequtils

type
    tkname = enum
        def, id, str, num, kw, uk, op
    token = object
        label: tkName
        value: string

method reset(tk: var token) {.base.} =
    tk.label = def
    tk.value = ""

method toStr(tk: token): string {.base.} =
    var res: string
    for ch in tk.value:
        res.add(ch)
    return res

proc newTk(): token =
    var res: token
    return res

proc ignore*[T](value: T) =
    discard

proc validIndex*[arrsize, arrtype](arr: array[arrsize, arrtype], index: int): bool =
    try:
        ignore[arrtype](arr[index])
        return true
    except IndexDefect:
        return false

const keywords = ["let", "const", "var", "if", "else"]
const decKeywords: array[3, string] = ["let", "const", "var"]
const operators = ["=", "+", "-", "/", "*", "^", "==", "++", "--"]
const operatorStart = ['=','+','-','/', '*', '^']

let compilationTarget = readFile("./ex/example.txt")

var
    tokens: seq[token]
    currentToken: token
    tkIdx: Natural = 0

currentToken.label = def

echo compilationTarget

for ch in compilationTarget:
    if currentToken.label == def:
        case ch
        of Whitespace:
            continue
        of '\"':
            currentToken.label = str
            continue
        of Digits:
            currentToken.label = num
            currentToken.value = $ch
        of IdentStartChars:
            currentToken.label = id
            currentToken.value = $ch
        of operatorStart:
            currentToken.label = uk
            currentToken.value = $ch
        else:
            continue
        continue
    
    case currentToken.label
    of def:
        break
    of str:
        if ch == '\"':
            echo int tkIdx
            let throwaway = @[currentToken]
            tokens = tokens & throwaway
            currentToken.reset()
            tkIdx += 1
        else:
            currentToken.value.add(ch)
    of id:
        if keywords.contains(currentToken.value):
            currentToken.label = kw
            echo int tkIdx
            let throwaway = @[currentToken]
            tokens = tokens & throwaway
            currentToken.reset()
            tkIdx += 1
        
        if Whitespace.contains(ch):
            echo int tkIdx
            let throwaway = @[currentToken]
            tokens = tokens & throwaway
            currentToken.reset()
            tkIdx += 1
        else:
            currentToken.value.add(ch)
    of num:
        if Whitespace.contains(ch):
            echo int tkIdx
            let throwaway = @[currentToken]
            tokens = tokens & throwaway
            currentToken.reset()
            tkIdx += 1
        else:
            currentToken.value.add(ch)
    of uk:
        if Whitespace.contains(ch) or IdentStartChars.contains(ch):
            if operators.contains(currentToken.value):
                
                # Stops here.
                
                echo int tkIdx
                currentToken.label = op
                tokens.add(currentToken)
                currentToken.value = ""
                currentToken.label = def
                tkIdx += 1
        else:
            currentToken.value.add(ch)
    of kw:
        currentToken.label = def
    of op:
        currentToken.label = def
    else:
        continue

for cTk in tokens:
    if cTk.label != def:
        echo cTk

shirleyquirk (orginal) [2023-01-07T10:19:16+01:00] view original

You can add code blocks with triple backticks, and check that the formatting worked with the 'preview'

shirleyquirk (orginal) [2023-01-07T14:29:38+01:00] view original

Oh that's so funny. When your code had all the whitespace removed, I did my best guess to reformat it, and there was one ambiguity. I had added an extra tab to the else in the of uk block, and it worked for me, except for the bug where it requires the input text end with whitespace

shirleyquirk (orginal) [2023-01-08T13:38:09+01:00] view original

i tend to do something along these lines:

import std/[strutils,parseutils]

type
    TokKind = enum
        def, id, str, num, kw, uk, op
    Token = object
        label: TokKind
        value: string

const
  keywords = ["let", "const", "var", "if", "else"]
  defKeywords: array[3, string] = ["let", "const", "var"]
  operators = ["=", "+", "-", "/", "*", "^", "==", "++", "--"]
  operatorStart = {'=','+','-','/','*','^',}

let compilationTarget = readFile("/tmp/lex/example.txt")

var tokens: seq[Token]

echo compilationTarget

var idx = compilationTarget.skipWhitespace()
while idx < compilationTarget.len:
  
  proc parseUntil(label: TokKind, until: set[char]|char):Token =
    result.label = label
    idx += compilationTarget.parseUntil(result.value, until, start = idx)
  proc parseWhile(label: TokKind, validChars: set[char]):Token =
    result.label = label
    idx += compilationTarget.parseWhile(result.value, validChars, start = idx)
  
  tokens.add case compilationTarget[idx]
  of '\"':
    str.parseUntil('\"')
  of Digits:
    num.parseWhile(Digits)
   #plus {'.','e'} maybe, or parseuntil whitespace, sure why not
   #i guess you're tokenizing `-1` as two tokens
  of operatorStart:
    op.parseWhile(operatorStart)
    # ok this isn't quite right but you get the idea
  of IdentStartChars:
    var tmp = id.parseWhile(IdentChars)
    if tmp.value in keywords:
      tmp.label = kw
    tmp
  else:#unknown? is that what 'uk' means?
    let tmp = Token(label: uk, value: $compilationTarget[idx])
    inc idx
    tmp
  
  idx += compilationTarget.skipWhitespace(idx)


for cTk in tokens:
  echo cTk

georgelemon (orginal) [2023-01-09T14:50:20+01:00] view original

Check std/lexbase.

Or even better, use my toktok package, a generic tokenizer based on Nim macros, lexbase and strutils.

Basic example:

import toktok

static:
    Program.settings(uppercase = true, prefix = "TK_")  # limitation: must start with TK_

tokens:
  Plus      > '+'
  Minus     > '-'
  Divide    > '/':
    Block_Comment ? '*' .. "*/"
    Inline_Comment ? '/' .. EOL
  Assign    > '='
  Alt_Comment   > '#' .. EOL
  Let       > "let"
  Const     > "const"
  True      > {"TRUE", "True", "true"}
  False     > {"FALSE", "False", "false"}


# sample.txt
let user = "john"

Use getToken() to parse each token. This proc returns a TokenTuple:

(kind: TK_LET, value: "let", wsno: 0, line: 1, col: 0, pos: 0)
(kind: TK_IDENTIFIER, value: "user", wsno: 1, line: 1, col: 4, pos: 4)
(kind: TK_ASSIGN, value: "", wsno: 0, line: 1, col: 9, pos: 9)
(kind: TK_STRING, value: "john", wsno: 1, line: 1, col: 11, pos: 11)

Full example of toktok in action, implementing a basic parser and AST handler https://github.com/openpeep/toktok/blob/main/examples/program.nim

axyz (orginal) [2023-01-10T11:20:37+01:00] view original

Hi, I have a quick question about BaseLexer open method in std/lexbase, that given your experience on the topic you may have an answer for.

It seems that the lexer is optimized for line based parsing? What is the best idiomatic way to parse a language that can ignore new lines? Currently for testing purpose my workaround is to do something like

lexbase.open(self, input, 999999)

where 999999 is an arbitrarily large number for the bufLen in

proc open(L: var BaseLexer; input: Stream; bufLen: int = 8192;
          refillChars: set[char] = NewLines) {.....}

but this is quite unreliable and probably not very efficient.

georgelemon (orginal) [2023-01-10T18:35:12+01:00] view original

Why ignore whitespaces/newlines ? You need this kind of info. What you want is to skip tokenizing the char itself but at the same time will store whitespace and line number in a tuple, for each getToken call


TokenTuple* = tuple[kind: TokenKind, value: string, wsno, line, column: int]

Here is an example using only std libraries. You'll need a lexer, a parser and ast representation. Now let's make this shine!

# lexer.nim
import std/[lexbase, streams]
from std/strutils import Whitespace, `%`

type
  TokenKind* = enum
    tkUnknown
    tkIdent
    tkColon
    tkSemicolon
    tkLet
    tkConst
    tkBool
    tkString
    tkAssign
    tkEof
  
  TokenTuple* = tuple[kind: TokenKind, value: string, wsno, line, column: int]
  
  Lexer* = object of BaseLexer
    kind: TokenKind
    token: string
    startPos, wsno: int
    multiLineStr: bool
    error: string

proc setError*(lex: var Lexer; message: string) =
    lex.kind = tkUnknown
    if lex.error.len == 0:
        lex.error = message

proc hasError*(lex: Lexer): bool {.inline.} =
    result = lex.error.len != 0

proc getError*(lex: Lexer): string {.inline.} =
    result = "($1:$2) $3" % [$lex.lineNumber, $(lex.startPos), lex.error]

# defer
proc handleIdentifier(lex: var Lexer)
proc handleSpecial(lex: var Lexer): char
proc handleString(lex: var Lexer)

proc handleNewLine(lex: var Lexer) =
  case lex.buf[lex.bufpos]
  of '\c': lex.bufpos = lex.handleCR(lex.bufpos)
  of '\n': lex.bufpos = lex.handleLF(lex.bufpos)
  else: discard

proc skip(lex: var Lexer) =
  var wsno: int
  while true:
    case lex.buf[lex.bufpos]
    of Whitespace:
      if lex.buf[lex.bufpos] in NewLines:
        lex.handleNewLine()
      else:
        inc lex.bufpos
        inc wsno
    else:
      lex.wsno = wsno
      return

proc skip(lex: var Lexer, keepLastWhitespace: bool) =
  var wsno: int
  while true:
    case lex.buf[lex.bufpos]
    of Whitespace:
      if lex.buf[lex.bufpos] in NewLines:
        lex.handleNewLine()
      else:
        try:
          if lex.buf[lex.bufpos + 1] notin Whitespace and
              lex.buf[lex.bufpos + 1] notin NewLines:
            lex.wsno = wsno
            break
        except IndexDefect: discard
        inc lex.bufpos
        inc wsno
    else:
      lex.wsno = wsno
      break

proc setToken(lex: var Lexer, kind: TokenKind, offset = 1) =
  lex.startPos = lex.getColNumber(lex.bufpos)
  lex.kind = kind
  inc lex.bufpos, offset

proc getToken*(lex: var Lexer): TokenTuple =
  lex.startPos = lex.getColNumber(lex.bufpos)
  setLen(lex.token, 0) # reset token value
  skip lex
  case lex.buf[lex.bufpos]:
  of '=':
    lex.setToken(tkAssign)
  of ';':
    lex.setToken(tkSemicolon)
  of ':':
    lex.setToken(tkColon)
  of '"':
    lex.handleString()
  of {'a' .. 'z', 'A' .. 'Z', '_'}:
    lex.handleIdentifier()
  of EndOfFile:
    lex.startPos = lex.getColNumber(lex.bufpos)
    lex.kind = tkEof
  else:
    lex.setToken(tkUnknown)
  (
    kind: lex.kind,
    value: lex.token,
    wsno: lex.wsno,
    line: lex.lineNumber,
    column: lex.startPos
  )

proc handleIdentifier(lex: var Lexer) =
  lex.startPos = lex.getColNumber(lex.bufpos)
  setLen(lex.token, 0)
  while true:
    if lex.buf[lex.bufpos] in {'a'..'z', 'A'..'Z', '_'}:
      add lex.token, lex.buf[lex.bufpos]
      inc lex.bufpos
    else:
      dec lex.bufpos
      break
  case lex.token:
  of "let":
    lex.setToken(tkLet)
  of "const":
    lex.setToken(tkConst)
  of "true", "false":
    lex.setToken(tkBool)
  else:
    lex.setToken(tkIdent)

proc handleSpecial(lex: var Lexer): char =
  inc lex.bufpos
  case lex.buf[lex.bufpos]
  of 'n':
    lex.token.add "\\n"
    result = '\n'
    inc lex.bufpos
  of '\\':
    lex.token.add "\\\\"
    result = '\\'
    inc lex.bufpos
  else:
    lex.setError("Unknown escape sequence: '\\" & lex.buf[lex.bufpos] & "'")
    result = '\0'

proc handleString(lex: var Lexer) =
  lex.startPos = lex.getColNumber(lex.bufpos)
  setLen(lex.token, 0)
  let lineno = lex.lineNumber
  inc lex.bufpos
  while true:
    case lex.buf[lex.bufpos]
    of '\\':
      discard lex.handleSpecial()
      if lex.hasError(): return
    of '"':
      lex.kind = tkString
      inc lex.bufpos
      break
    of NewLines:
      if lex.multiLineStr:
        lex.skip(keepLastWhitespace = true)
      else:
        lex.setError("EOL reached before end of string")
        return
    of EndOfFile:
      lex.setError("EOF reached before end of string")
      return
    else:
      add lex.token, lex.buf[lex.bufpos]
      inc lex.bufpos
  if lex.multiLineStr:
    lex.lineNumber = lineno

# ast.nim
from std/strutils import parseBool, parseInt

when not defined release:
  import std/[json, jsonutils]

type
  NodeType = enum
    NTLet
    NTConst
    NTFunction
    NTStmtList
    NTBool
    NTString
  
  Node* = ref object
    case nodeType: NodeType
    of NTBool:
      bVal: bool
    of NTString:
      sVal: string
    of NTLet:
      letIdent: string
      letValue: Node
    of NTConst:
      constIdent: string
      constValue: Node
    of NTFunction:
      fnIdent: string
      fnParams: seq[Node]
      fnBody: Node # NTStmtList
    of NTStmtList:
      stmtList: seq[Node]
  
  Program* = object
    nodes: seq[Node]

proc `&=`*(program: var Program, node: Node) =
    program.nodes.add node

proc newBool*(value: string): Node =
  Node(nodeType: NTBool, bVal: parseBool(value))

proc newString*(value: string): Node =
  Node(nodeType: NTString, sVal: value)

proc newLet*(ident: string, value: Node): Node =
  Node(nodeType: NTLet, letIdent: ident, letValue: value)

proc newConst*(ident: string, value: Node): Node =
  Node(nodeType: NTConst, constIdent: ident, constValue: value)

when not defined release:
  proc `$`*(node: Program, indent = 2): string =
    pretty toJson(node), indent

# parser.nim
import ./ast
export ast

include ./lexer

type
  Parser = object
    lex*: Lexer
    error: string
    prev*, current*, next*: TokenTuple
    program: Program
  
  PrefixFunction = proc(p: var Parser): Node

proc setError(p: var Parser, msg: string) =
    p.error = "Error ($2:$3): $1" % [msg, $p.current.line, $p.current.column]

proc hasError*(p: var Parser): bool {.inline.} =
    p.error.len != 0

proc getError*(p: var Parser): string =
    result = "($1:$2) $3" % [$p.current.line, $p.current.column, p.error]

proc walk(p: var Parser, offset = 1) =
  var i = 0
  while offset != i:
    p.prev = p.current
    p.current = p.next
    p.next = p.lex.getToken()
    inc i

proc parseVarDefinition(p: var Parser): Node =
  let this = p.current
  walk p
  let identName = p.current.value
  if p.next.kind != tkAssign:
    p.setError("Missing assignment")
    return
  walk p, 2
  var identValue: Node = case p.current.kind:
                          of tkString: newString(p.current.value)
                          of tkBool:   newBool(p.current.value)
                          else:
                            p.setError("Invalid value assignment")
                            nil
  if this.kind == tkLet:
    result = newLet(identName, identValue)
  elif this.kind == tkConst:
    result = newConst(identName, identValue)
  if p.next.kind != tkSemicolon:
    p.setError("Missing semicolon")
    return
  walk p, 2

proc unexpected(p: var Parser): Node =
  p.setError("Unexpected token \"$1\"" % [
    if p.current.value.len != 0:
      p.current.value
    else:
      $p.current.kind
  ])

proc getPrefixFn(p: var Parser, kind: TokenKind): PrefixFunction =
  result = case kind:
    of tkLet, tkConst:
      parseVarDefinition
    else: unexpected

proc parseExp(p: var Parser): Node =
  let this = p.current
  let callPrefixFn = p.getPrefixFn(this.kind)
  let exp: Node = p.callPrefixFn()
  if exp != nil:
    result = exp

proc parseProgram*(p: var Parser) =
  while p.current.kind != tkEof and (p.hasError == false and p.lex.hasError == false):
    let node = p.parseExp()
    if node != nil:
      p.program &= node

proc getAst*(p: var Parser): Program =
    result = p.program

proc close*(p: var Parser) =
    p.lex.close()

proc newParser*(fileContents: string): Parser =
  result = Parser(
    lex: Lexer(
      kind: tkUnknown,
      startPos: 0,
      token: "",
      error: ""
    ),
    program: Program()
  )
  result.lex.open(newStringStream(fileContents))
  result.current = result.lex.getToken()
  result.next = result.lex.getToken()

Main file.

import ./parser

when isMainModule:
  var p = newParser(readFile("sample.txt"))
  p.parseProgram()
  
  if p.lex.hasError:
    echo p.lex.getError()
  elif p.hasError:
    echo p.getError()
  else:
    when not defined release:
      echo p.getAst()
    else: discard
  
  p.close()

your sample.txt


let a = true; let b = "who"; const c = false;

Tokens:

(kind: tkLet, value: "let", wsno: 0, line: 1, column: 2)
(kind: tkIdent, value: "a", wsno: 1, line: 1, column: 4)
(kind: tkAssign, value: "", wsno: 1, line: 1, column: 6)
(kind: tkBool, value: "true", wsno: 1, line: 1, column: 11)
(kind: tkSemicolon, value: "", wsno: 0, line: 1, column: 12)
(kind: tkLet, value: "let", wsno: 1, line: 1, column: 16)
(kind: tkIdent, value: "b", wsno: 1, line: 1, column: 18)
(kind: tkAssign, value: "", wsno: 1, line: 1, column: 20)
(kind: tkString, value: "who", wsno: 1, line: 1, column: 22)
(kind: tkSemicolon, value: "", wsno: 0, line: 1, column: 27)
(kind: tkConst, value: "const", wsno: 1, line: 1, column: 33)
(kind: tkIdent, value: "c", wsno: 1, line: 1, column: 35)
(kind: tkAssign, value: "", wsno: 1, line: 1, column: 37)
(kind: tkBool, value: "false", wsno: 1, line: 1, column: 43)
(kind: tkSemicolon, value: "", wsno: 0, line: 1, column: 44)

AST Output:


{
  "nodes": [
    {
      "nodeType": 0,
      "letIdent": "a",
      "letValue": {
        "nodeType": 4,
        "bVal": true
      }
    },
    {
      "nodeType": 0,
      "letIdent": "b",
      "letValue": {
        "nodeType": 5,
        "sVal": "who"
      }
    },
    {
      "nodeType": 1,
      "constIdent": "c",
      "constValue": {
        "nodeType": 4,
        "bVal": false
      }
    }
  ]
}

Rock n roll!

axyz (orginal) [2023-01-11T10:02:51+01:00] view original

Thanks for the extensive example, I will have a look in detail.

Maybe, though, my problem was not too clear. I have no problem in ignoring the new lines or white spaces. My issue is that when using the open method of BaseLexer with the default BufLen and I try to parse a file where all the code is in a single line longer than the BufLen the program crashes.

I can see in your example you are also using result.lex.open(newStringStream(fileContents)) with default BufLen, will it work if your test file has more than 8192 characters in one line?

georgelemon (orginal) [2023-01-11T14:34:05+01:00] view original

Ah, that makes sense now. What about len(fileContents)?

Anyway, for really big files there is std/memfiles

axyz (orginal) [2023-01-11T15:21:39+01:00] view original

may have to dig deeper in the BaseLexer source, but I think the problem is not about reading a large file, but the fact that the lexer buffers one line at the time and has this BufLen limit instead of having something like a moving window or similar. My assumption is that this works well for line based languages (e.g. nim itself), but in a language where going to a new line is optional it will fail.

setting the buffer length to len(fileContents) is a quick solution, but I fear may not be very efficient (e.g. extremely large file with normal length lines), hence my questions if there is a different more idiomatic way to solve this problem. It may be that the base lexer is indeed optimized for line based languages and may need a completely different tool, or maybe there is some option I am not aware of to bypass this limitation, seems anyway an interesting use case.

Araq (orginal) [2023-01-11T16:13:38+01:00] view original

lexbase is not "line based", use the refillChars optional parameter:

proc open*(L: var BaseLexer, input: Stream, bufLen: int = 8192;
           refillChars: set[char] = NewLines)

You need to call handleRefillChar for every char that is in your refillChars set. But using a memory mapped file instead should be way easier and faster too. But lexbase works on embedded devices without an MMU.

BombManYeeted (orginal) [2023-01-16T05:28:19+01:00] view original

I've made a new lexer, since I had a hard time making a parser. Here it it:

import std/strutils
import std/os

let file = readFile(paramStr(1))

type
    tokenName = enum
        default, unknown, ident, str, integer, double, keyword, operator, seperator
    token = object
        label: tokenName
        value: string
    abstNode = object
        head: token
        body: seq[abst]
    abst = ref abstNode

let equivParams = [
    (name: "none", equiv_fn: proc (x: token): bool =
        return [default, unknown].contains(x.label)
    ),
    (name: "identifier", equiv_fn: proc (x: token): bool =
        return x.label == ident
    ),
    (name: "type", equiv_fn: proc (x: token): bool =
        return ["int", "char", "string"].contains(x.value)
    ),
    (name: "=", equiv_fn: proc (x: token): bool =
        return x.label == operator and x.value == "="
    ),
    (name: ":", equiv_fn: proc (x: token): bool =
        return x.label == seperator and x.value == ":"
    )
]

var equivParamValues: seq[string]

for v in equivParams:
    equivParamValues.add(v.name)

proc tkn(label: tokenName, value: string): token =
    var t: token
    t.label = label
    t.value = value
    return t

proc newAst(ast_tk: token): abst =
    var nw: abst
    nw.head = ast_tk
    return nw

var currentTk: token
var tokens: seq[token]

const
    seperators = ['(','[', '{', '}', ']', ')', ':', ';',]
    head_seperators = ['(','[', '{']
    end_seperators = [')', ']', '}']
    operators = ["=", "+", "-", "/", "*", "^", "==", "++", "--", "<", ">"]
    operatorStart = ['=','+','-','/', '*', '^', '<', '>']

var keywordStrings = ["let","var", "const", "fn", "return"]

proc reset(tk: var token) =
    tk.label = default
    tk.value = ""

proc unknownType(ch: char) =
    case ch:
    of '\"':
        currentTk.label = str
    of Digits:
        currentTk.label = integer
        currentTk.value = $ch
        tokens.add(currentTk)
    of IdentStartChars:
        currentTk.label = ident
        currentTk.value = $ch
    of Whitespace:
        return
    of seperators:
        currentTk.label = seperator
        currentTk.value = $ch
        tokens.add(currentTk)
        currentTk.reset()
    of operatorStart:
        currentTk.label = operator
        currentTk.value = $ch
    else:
        return

proc identType(ch: char) =
    case ch
    of operatorStart:
        tokens.add(currentTk)
        currentTk.reset()
        currentTk.label = operator
        currentTk.value = $ch
        return
    of seperators:
        tokens.add(currentTk)
        currentTk.reset()
        currentTk.label = seperator
        currentTk.value = $ch
        tokens.add(currentTk)
        currentTk.reset()
        return
    of Whitespace:
        if keywordStrings.contains(currentTk.value):
            currentTk.label = keyword
        tokens.add(currentTk)
        currentTk.reset()
        return
    of IdentChars:
        currentTk.value.add(ch)
        return
    else:
        return

proc strType(ch: char) =
    if ch == '\"':
        tokens.add(currentTk)
        currentTk.reset()
    else:
        currentTk.value.add(ch)

proc numType(ch: char) =
    case ch
    of Digits:
        currentTk.value.add(ch)
    of '.':
        currentTk.value.add(ch)
        currentTk.label = double
    of Whitespace:
        tokens.add(currentTk)
        currentTk.reset()
    of operatorStart:
        tokens.delete(tokens.len() - 1)
        tokens.add(currentTk)
        currentTk.reset()
        currentTk.label = operator
        currentTk.value = $ch
        return
    of seperators:
        tokens.delete(tokens.len() - 1)
        tokens.add(currentTk)
        currentTk.reset()
        currentTk.label = seperator
        currentTk.value = $ch
        tokens.add(currentTk)
        currentTk.reset()
    of IdentStartChars:
        tokens.add(currentTk)
        currentTk.label = ident
        currentTk.value = $ch
    else:
        discard

proc opType(ch: char) =
    if operators.contains($ch):
        tokens.add(currentTk)
        currentTk.reset()
    if Whitespace.contains(ch):
        if operators.contains(currentTk.value):
            tokens.add(currentTk)
            currentTk.reset()
    else:
        currentTk.value.add(ch)

for ch in file:
    if currentTk.label == default:
        unknownType(ch)
    else:
        case currentTk.label
        of default:
            break
        of str:
            strType(ch)
        of ident:
            identType(ch)
        of integer, double:
            numType(ch)
        of operator:
            opType(ch)
        else:
            continue

proc echoTks(tkns: seq[token]) =
    for t in tokens:
        echo t

var idx: int = 0

proc parseTkns(tkns: var seq[token], params: seq[string]): abst  =
    var ast: abst
    
    if params.len() == 0:
        
        while(true):
            if tkns[idx].value in ["let", "var", "const"]:
                ast.body.add(parseTkns(tkns, @["identifier", ":", "type"]))
            idx += 1
            if idx == tkns.len():
                break
    else:
        var par: seq[string] = params
        while(par.len() > idx):
            echo par[idx]
            if equivParams[equivParamValues.find(par[idx])].equiv_fn(tkns[idx]):
                ast.body.add(newAst(tkns[idx]))
            idx += 1
    return ast

var ast = parseTkns(tokens, @[])

, And when I run it, it tells me that:

Traceback (most recent call last) srcmain.nim(213) main srcmain.nim(200) parseTkns SIGSEGV: Illegal storage access. (Attempt to read from nil?).

Does anyone know how to fix this, or another way to do this? I'm sure it has to do with the recursion on the lines.

shirleyquirk (orginal) [2023-01-16T16:33:31+01:00] view original

it's because abst (please begin types with a capital letter btw) is a ref object, and you're dereferencing it when it is still nil.

you need to

var ast = new abst

Mirror of forum.nim-lang.org

9804 :: Trying to make a lexer, stops if it hits an operator.