// token-s.txt
//   public release som v3.0   5 March 2007 (Maka-bucha day)
//   public release som v3.1   19 Aug 2007 (Draft vote day)
//   public release som v4.0   2 July 2008

enum
    80 maxcol				// max depth of nested indent

// bitvector array 256 of hexval:hex:sep:space:letter
// for example <9> 1001:1:0:0:1 9*16+9
//
// cLetter 	# 0..9 A..Z _ a..z
// cSpace 	<1>..<32> <128>..<254>
// cSep 	"!%&()*+-/:<=>[]^{|} <0> <EOF_CHAR>
// cHex 	0..9 A..F a..f
// use gen-ctype-s.txt  to generate ctype[.]

ctype = array
    4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
    2 4 4 1 2 4 4 2 4 4 4 4 2 4 2 4 9 25 41 57 73 89 105 121 137 153 4 2 4 4 4 2
    2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 2 4 4 1
    2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 2 2
    2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
    2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
    2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
    2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4

: isNum c = (c >= 48) & (c <= 57)	// 0..9
: isHex c = ctype[c] & 8
: isSpace c = ctype[c] & 2
: isSep c = ctype[c] & 4
: isLetter c = ctype[c] & 1
: hexval c = (ctype[c] >> 4) & 15

to initlex =
    inbuf = array 1000		// input line buffer
    tok = 0					// current token
    tokvalue = 0			// token value
    tokstring = array 256	// token string
    // column stack for block indentation
    tokcol = 0				// current token column
    colstk = array maxcol	// column stack
    colsp = 0

to pushcol col =
    if colsp >= maxcol
        seterror "lex error col stack overflow"
    else
        colsp = colsp + 1
        colstk[colsp] = col

to popcol =
    if colsp <= 0
        seterror "lex error col stack underflow"
    else
        colsp = colsp - 1

to startlex =
    CLEN = 0				// current token length
    TP = 0					// current token pointer
    line = 0				// current line number
    lexstate = 1			// state of lex
    oldtok = 0				// old token
    colsp = 0				// col stack pointer
    pushcol 0

// get whole word, update CLEN
to token | pos =
    pos = TP
    while isLetter inbuf[pos]
        pos = pos + 1
    CLEN = pos - TP

// read until " or end of line, CLEN whole str ".."
to collectstring | a c =
    a = TP+1
    c = inbuf[a]
    while (c != 34) & (c != 0) // until " or eol
        a = a + 1
        c = inbuf[a]
    CLEN = a - TP + 1

to tokNumber | c i v =
    token
    v = 0
    for i TP TP+CLEN-1
        c = inbuf[i]
        if ! isNum c
            seterror "invalid integer"
        v = v*10 + c - 48
    tokvalue = v
    tok = tkNUMBER

to tokHex | c i h =
    token
    c = inbuf[TP+1]			// check first digit
    if ! isHex c
        seterror "invalid hex"
    h = hexval c
    if h > 7
        h = h - 16			// sign extension
    for i TP+2 TP+CLEN-1
        c = inbuf[i]
        if ! isHex c
            seterror "invalid hex"
        h = h*16 + hexval c
    tokvalue = h
    tok = tkNUMBER

// to terminate a line, pad EOF at end of line
// for interactive mode r-e-p
to padEOF | pos =
    pos = 0
    while inbuf[pos] != 0
        pos = pos + 1
    inbuf[pos] = 255		// EOF_CHAR
    inbuf[pos+1] = 0

// affect CLEN, tok, tokvalue, tokstring. change TP
to lex1 | pos c d =
    pos = TP
    while 1				// read until get a token
        if (TP == 0) | (inbuf[pos] == 0)	// end buffer
            fgets FI inbuf					// get a line
            line = line + 1
            pos = 0
            if cmode == 2 padEOF			// interactive
        while isSpace inbuf[pos]			// skip space
            pos = pos + 1
        c = inbuf[pos]
        d = inbuf[pos+1]					// lookahead
        if c == 0 							// buffer empty
            TP = 0
        else if (c == 47) & (d == 47) 		// "//"
            TP = 0
        else
            break
    TP = pos
    tokcol = pos
    CLEN = 1
    case c
        48: tokNumber		// 0..9
        49: tokNumber
        50: tokNumber
        51: tokNumber
        52: tokNumber
        53: tokNumber
        54: tokNumber
        55: tokNumber
        56: tokNumber
        57: tokNumber
        34:					// " string
            collectstring
            strpack tokstring inbuf TP+1 CLEN-2
            tok = tkSTRING
        35: tokHex			// # hex num
        33: if d == 61		// !=
                CLEN = 2
                tok = tkNE
            else
                tok = tkNOT
        37: tok = tkMOD
        38: tok = tkAND
        40: tok = tkLPAREN
        41: tok = tkRPAREN
        42: tok = tkSTAR
        43: tok = tkPLUS
        45: tok = tkMINUS
        47: tok = tkSLASH
        58: tok = tkCOLON
        60: if d == 60		// <<
                CLEN = 2
                tok = tkLTLT
            else if d == 61 // <=
                CLEN = 2
                tok = tkLE
            else
                tok = tkLT
        61: if d == 61		// ==
                CLEN = 2
                tok = tkEQEQ
            else
                tok = tkEQ
        62: if d == 62		// >>
                CLEN = 2
                tok = tkGTGT
            else if d == 61 // >=
                CLEN = 2
                tok = tkGE
            else
                tok = tkGT
        91: tok = tkLBRACKET // [
        93: tok = tkRBRACKET // ]
        94: tok = tkCARET	 // ^
        123: tok = tkBB		 // {
        124: tok = tkBAR	 // |
        125: tok = tkBE		 // }
        else:
            if c == 255		// EOF
                tok = tkEOF
            else			// it is an indentifier
                token
                strpack tokstring inbuf TP CLEN
                tokvalue = install tokstring
                tok = tkIDEN
                if (getType tokvalue) == tyKEY // key word
                    tok = getRef tokvalue
    // end case
    TP = TP + CLEN

to prtoken tk = {}

// lexstate N=1, F=2, B=3
// state description   condition  action    next
//   N  normal  lex     oldline   out tok     N
//   N                  nl & ==   out tok     N
//   N                  nl & >    out { push  F
//   N                  nl & <    out } pop   B
//   F  forward         -         out oldtok  N
//   B  backward        ==        out oldtok  N
//   B                  <         out } pop   B
//   transition occurs on call lex
//   eof is a special case

to lex | oldline =
    case lexstate
        1:	// normal
            oldline = line
            lex1
//          printc 34 prtoken tok printc 34 space
            oldtok = tok
            if tok == tkEOF			// out BE til match
                if colstk[colsp] != 0
                    tok = tkBE
                    popcol
                break
            if line != oldline		// newline
                if tokcol > colstk[colsp]
                    tok = tkBB
                    pushcol tokcol
                    lexstate = 2	// F
                else if tokcol < colstk[colsp]
                    tok = tkBE
                    popcol
                    lexstate = 3	// B
                // tokcol == colstk, just return tok
        2:	// forward
            tok = oldtok
            lexstate = 1			// N
        3:	// backward
            if tokcol >= colstk[colsp]
                tok = oldtok
                lexstate = 1		// N
            else if tokcol < colstk[colsp]
                tok = tkBE
                popcol

// original prtoken (in som-v2) is 251 words
// this version using tok_str is 41 words

// print string of token, len 3 each
tok_str = array
    "*" 0 "/" 0 "-" 0 "+" 0 "=" 0 "==" 0 "&" 0 "|" 0
    "^" 0 "%" 0 "!" 0 "!=" 0 "<" 0 "<=" 0 "<<" 0 ">" 0
    ">=" 0 ">>" 0 ":" 0 "(" 0 ")" 0 "[" 0 "]" 0 "{" 0
    "}" 0 "to" 0 "if" 0 "else" 0 "while" "for" 0 "break" "array"
    "case" 0 "enum" 0 "syscall"

to prtoken tk | a =
    if tk > tkSYSCALL
        seterror "unknown token"
    if tk >= tkSTAR
        a = tk - tkSTAR		// a 0..34
        prints tok_str+(a*3)
        break
    case tk
        tkIDEN: prints tokstring
        tkNUMBER: print tokvalue
        tkSTRING:
            printc 34 prints tokstring printc 34
        tkEOF: prints "eof"

// print atom
to pratom type ref =
    case type
        SP:
            prints "NIL"
        OPER:
            prtoken ref
        ADS:
            printc 36 print ref	// $
        NUM:
            print ref
        GNAME:
            prints getName ref
        LNAME:
            printc 35 print ref // #
        CNAME:
            prints "classname"
        STRING:
            printc 34 prints ref printc 34

// print list
to prlist a =
    if a == NIL break
    if isatom a
        pratom (car a) (cdr a)
        space
    else
        printc 40   // "("
        while a != NIL
            prlist car a
            a = cdr a
        printc 41   // ")"

to enterKey nm ref | a =
    a = install nm
    setType a tyKEY
    setRef a ref

to initkeysym =
    init_symtab
    enterKey "to" tkTO
    enterKey "if" tkIF
    enterKey "else" tkELSE
    enterKey "while" tkWHILE
    enterKey "for" tkFOR
    enterKey "break" tkBREAK
    enterKey "array" tkARRAY
    enterKey "case" tkCASE
    enterKey "enum" tkENUM
    enterKey "syscall" tkSYSCALL

// End
