// token-s.txt
//   public release som v3.0   5 March 2007 (Maka-bucha day)
//   public release som v3.1   19 Aug 2007 (Draft vote day)
//   public release som v4.0   2 July 2008
//   public release som v4.1   9 Aug 2008 (Birthday)


enum
	80 maxcol				// max depth of nested indent

// bitvector array 256 of hexval:hex:sep:space:letter
// for example <9> 1001:1:0:0:1 9*16+9
//
// cLetter 	# 0..9 A..Z _ a..z
// cSpace 	<1>..<32> <128>..<254>
// cSep 	"!%&()*+-/:<=>[]^{|} <0> <EOF_CHAR>
// cHex 	0..9 A..F a..f
// use gen-ctype-s.txt  to generate ctype[.]

ctype = array
	4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 4 4 1 2 4 4 2 4 4 4 4 2 4 2 4 9 25 41 57 73 89 105 121 137 153 4 2 4 4 4 2
	2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 2 4 4 1
	2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4

: isNum c = (c >= 48) & (c <= 57)	// 0..9
: isHex c = ctype[c] & 8
: isSpace c = ctype[c] & 2
: isSep c = ctype[c] & 4
: isLetter c = ctype[c] & 1
: hexval c = (ctype[c] >> 4) & 15
: lex2 = syscall {16}

to initlex =
	inbuf = array 1000		// input line buffer
	tok = 0					// current token
	tokvalue = 0			// token value
	tokstring = array 256	// token string
	// column stack for block indentation
	tokcol = 0				// current token column
	colstk = array maxcol	// column stack
	colsp = 0

to pushcol col =
	if colsp >= maxcol
		seterror "lex error col stack overflow"
	else
		colsp = colsp + 1
		colstk[colsp] = col

to popcol =
	if colsp <= 0
		seterror "lex error col stack underflow"
	else
		colsp = colsp - 1

to startlex =
	CLEN = 0				// current token length
	TP = 0					// current token pointer
	line = 0				// current line number
	lexstate = 1			// state of lex
	oldtok = 0				// old token
	colsp = 0				// col stack pointer
	pushcol 0

// get whole word, update CLEN
to token | pos =
	pos = TP
	while isLetter inbuf[pos]
		pos = pos + 1
	CLEN = pos - TP

// read until " or end of line, CLEN whole str ".."
to collectstring | a c =
	a = TP+1
	c = inbuf[a]
	while (c != 34) & (c != 0) // until " or eol
		a = a + 1
		c = inbuf[a]
	CLEN = a - TP + 1

to tokNumber | c i v =
	token
	v = 0
	for i TP TP+CLEN-1
		c = inbuf[i]
		if ! isNum c
			seterror "invalid integer"
		v = v*10 + c - 48
	tokvalue = v
	tok = tkNUMBER

to tokHex | c i h =
	token
	c = inbuf[TP+1]			// check first digit
	if ! isHex c
		seterror "invalid hex"
	h = hexval c
	if h > 7
		h = h - 16			// sign extension
	for i TP+2 TP+CLEN-1
		c = inbuf[i]
		if ! isHex c
			seterror "invalid hex"
		h = h*16 + hexval c
	tokvalue = h
	tok = tkNUMBER

// to terminate a line, pad EOF at end of line
// for interactive mode r-e-p
to padEOF | pos =
	pos = 0
	while inbuf[pos] != 0
		pos = pos + 1
	inbuf[pos] = 255		// EOF_CHAR
	inbuf[pos+1] = 0

// affect CLEN, tok, tokvalue, tokstring. change TP
to lex1 | pos c d =
	pos = TP
	while 1				// read until get a token
		if or (TP == 0)(inbuf[pos] == 0)	// end buffer
			fgets FI inbuf					// get a line
			line = line + 1
			pos = 0
			if cmode == 2 padEOF			// interactive
		while isSpace inbuf[pos]			// skip space
			pos = pos + 1
		c = inbuf[pos]
		d = inbuf[pos+1]					// lookahead
		if c == 0 							// buffer empty
			TP = 0
		else if and (c == 47) (d == 47) 	// "//"
			TP = 0
		else
			break
	TP = pos
	tokcol = pos
	CLEN = 1
	case c
		48: tokNumber		// 0..9
		49: tokNumber
		50: tokNumber
		51: tokNumber
		52: tokNumber
		53: tokNumber
		54: tokNumber
		55: tokNumber
		56: tokNumber
		57: tokNumber
		34:					// " string
			collectstring
			strpack tokstring inbuf TP+1 CLEN-2
			tok = tkSTRING
		35: tokHex			// # hex num
		33: if d == 61		// !=
				CLEN = 2
				tok = tkNE
			else
				tok = tkNOT
		37: tok = tkMOD
		38: tok = tkAND
		40: tok = tkLPAREN
		41: tok = tkRPAREN
		42: tok = tkSTAR
		43: tok = tkPLUS
		45: tok = tkMINUS
		47: tok = tkSLASH
		58: tok = tkCOLON
		60: if d == 60		// <<
				CLEN = 2
				tok = tkLTLT
			else if d == 61 // <=
				CLEN = 2
				tok = tkLE
			else
				tok = tkLT
		61: if d == 61		// ==
				CLEN = 2
				tok = tkEQEQ
			else
				tok = tkEQ
		62: if d == 62		// >>
				CLEN = 2
				tok = tkGTGT
			else if d == 61 // >=
				CLEN = 2
				tok = tkGE
			else
				tok = tkGT
		91: tok = tkLBRACKET // [
		93: tok = tkRBRACKET // ]
		94: tok = tkCARET	 // ^
		123: tok = tkBB		 // {
		124: tok = tkBAR	 // |
		125: tok = tkBE		 // }
		else:
			if c == 255		// EOF
				tok = tkEOF
			else			// it is an indentifier
				token
				strpack tokstring inbuf TP CLEN
				tokvalue = install tokstring
				tok = tkIDEN
				if (getType tokvalue) == tyKEY // key word
					tok = getRef tokvalue
	// end case
	TP = TP + CLEN

to lex3 =
	tok = lex2
	if tok == tkIDEN
		tokvalue = install tokstring
		if (getType tokvalue) == tyKEY // key word
			tok = getRef tokvalue

to prtoken tk = {}

// lexstate N=1, F=2, B=3
// state description   condition  action    next
//   N  normal  lex     oldline   out tok     N
//   N                  nl & ==   out tok     N
//   N                  nl & >    out { push  F
//   N                  nl & <    out } pop   B
//   F  forward         -         out oldtok  N
//   B  backward        ==        out oldtok  N
//   B                  <         out } pop   B
//   transition occurs on call lex
//   eof is a special case

to lex | oldline =
	case lexstate
		1:	// normal
			oldline = line
//			lex1
			lex3
//			printc 34 prtoken tok printc 34 space
			oldtok = tok
			if tok == tkEOF			// out BE til match
				if colstk[colsp] != 0
					tok = tkBE
					popcol
				break
			if line != oldline		// newline
				if tokcol > colstk[colsp]
					tok = tkBB
					pushcol tokcol
					lexstate = 2	// F
				else if tokcol < colstk[colsp]
					tok = tkBE
					popcol
					lexstate = 3	// B
				// tokcol == colstk, just return tok
		2:	// forward
			tok = oldtok
			lexstate = 1			// N
		3:	// backward
			if tokcol >= colstk[colsp]
				tok = oldtok
				lexstate = 1		// N
			else // if tokcol < colstk[colsp]
				tok = tkBE
				popcol

// original prtoken (in som-v2) is 251 words
// this version using tok_str is 41 words

// print string of token, len 3 each
tok_str = array
	"*" 0 "/" 0 "-" 0 "+" 0 "=" 0 "==" 0 "&" 0 "|" 0
	"^" 0 "%" 0 "!" 0 "!=" 0 "<" 0 "<=" 0 "<<" 0 ">" 0
	">=" 0 ">>" 0 ":" 0 "(" 0 ")" 0 "[" 0 "]" 0 "{" 0
	"}" 0 "to" 0 "if" 0 "else" 0 "while" "for" 0 "break" "array"
	"case" 0 "enum" 0 "syscall"

to prtoken tk | a =
	if tk > tkSYSCALL
		seterror "unknown token"
	if tk >= tkSTAR
		a = tk - tkSTAR		// a 0..34
		prints tok_str+(a*3)
		break
	case tk
		tkIDEN: prints tokstring
		tkNUMBER: print tokvalue
		tkSTRING:
			printc 34 prints tokstring printc 34
		tkEOF: prints "eof"

// print atom
to pratom type ref =
	case type
		SP:
			prints "NIL"
		OPER:
			prtoken ref
		ADS:
			printc 36 print ref	// $
		NUM:
			print ref
		GNAME:
			prints getName ref
		LNAME:
			printc 35 print ref // #
		CNAME:
			prints "classname"
		STRING:
			printc 34 prints ref printc 34

// print list
to prlist a =
	if a == NIL break
	if isatom a
		pratom (car a) (cdr a)
		space
	else
		printc 40   // "("
		while a != NIL
			prlist car a
			a = cdr a
		printc 41   // ")"

to enterKey nm ref | a =
	a = install nm
	setType a tyKEY
	setRef a ref

to initkeysym =
	init_symtab
	enterKey "to" tkTO
	enterKey "if" tkIF
	enterKey "else" tkELSE
	enterKey "while" tkWHILE
	enterKey "for" tkFOR
	enterKey "break" tkBREAK
	enterKey "array" tkARRAY
	enterKey "case" tkCASE
	enterKey "enum" tkENUM
	enterKey "syscall" tkSYSCALL

// End

