// token-s.txt
//   public release som v3.0   5 March 2007 (Maka-bucha day)
//   public release som v3.1   19 Aug 2007 (Draft vote day)
//   public release som v4.0   2 July 2008
//   public release som v4.1   9 Aug 2008 (Birthday)

enum
	80 maxcol				// max depth of nested indent

// bitvector array 256 of hexval:hex:sep:space:letter
// for example <9> 1001:1:0:0:1 9*16+9
//
// cLetter 	# 0..9 A..Z _ a..z
// cSpace 	<1>..<32> <128>..<254>
// cSep 	"!%&()*+-/:<=>[]^{|} <0> <EOF_CHAR>
// cHex 	0..9 A..F a..f
// use gen-ctype-s.txt  to generate ctype[.]

ctype = array
	4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 4 4 1 2 4 4 2 4 4 4 4 2 4 2 4 9 25 41 57 73 89 105 121 137 153 4 2 4 4 4 2
	2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 2 4 4 1
	2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4

: isNum c = (c >= 48) & (c <= 57)	// 0..9
: isHex c = ctype[c] & 8
: isSpace c = ctype[c] & 2
: isSep c = ctype[c] & 4
: isLetter c = ctype[c] & 1
: hexval c = (ctype[c] >> 4) & 15

to initlex =
	inbuf = array 1000		// input line buffer
	tok = 0					// current token
	tokvalue = 0			// token value
	tokstring = array 256	// token string
	// column stack for block indentation
	tokcol = 0				// current token column
	colstk = array maxcol	// column stack
	colsp = 0

to pushcol col =
	if colsp >= maxcol
		seterror "lex error col stack overflow"
	else
		colsp = colsp + 1
		colstk[colsp] = col

to popcol =
	if colsp <= 0
		seterror "lex error col stack underflow"
	else
		colsp = colsp - 1

to startlex =
	CLEN = 0				// current token length
	TP = 0					// current token pointer
	line = 0				// current line number
	lexstate = 1			// state of lex
	oldtok = 0				// old token
	colsp = 0				// col stack pointer
	pushcol 0

// get whole word, update CLEN
to token | pos =
	pos = TP
	while isLetter inbuf[pos]
		pos = pos + 1
	CLEN = pos - TP

// read until " or end of line, CLEN whole str ".."
to collectstring | a c =
	a = TP+1
	c = inbuf[a]
	while (c != 34) & (c != 0) // until " or eol
		a = a + 1
		c = inbuf[a]
	CLEN = a - TP + 1

to tokNumber | c i v =
	token
	v = 0
	for i TP TP+CLEN-1
		c = inbuf[i]
		if ! isNum c
			seterror "invalid integer"
		v = v*10 + c - 48
	tokvalue = v
	tok = tkNUMBER

to tokHex | c i h =
	token
	c = inbuf[TP+1]			// check first digit
	if ! isHex c
		seterror "invalid hex"
	h = hexval c
	if h > 7
		h = h - 16			// sign extension
	for i TP+2 TP+CLEN-1
		c = inbuf[i]
		if ! isHex c
			seterror "invalid hex"
		h = h*16 + hexval c
	tokvalue = h
	tok = tkNUMBER

// to terminate a line, pad EOF at end of line
// for interactive mode r-e-p
to padEOF | pos =
	pos = 0
	while inbuf[pos] != 0
		pos = pos + 1
	inbuf[pos] = 255		// EOF_CHAR
	inbuf[pos+1] = 0

// affect CLEN, tok, tokvalue, tokstring. change TP
to lex1 | pos c d =
	pos = TP
	while 1				// read until get a token
		if or (TP == 0)(inbuf[pos] == 0)	// end buffer
			fgets FI inbuf					// get a line
			line = line + 1
			pos = 0
			if cmode == 2 padEOF			// interactive
		while isSpace inbuf[pos]			// skip space
			pos = pos + 1
		c = inbuf[pos]
		d = inbuf[pos+1]					// lookahead
		if c == 0 							// buffer empty
			TP = 0
		else if and (c == 47) (d == 47) 	// "//"
			TP = 0
		else
			break
	TP = pos
	tokcol = pos
	CLEN = 1
	case c
		48: tokNumber		// 0..9
		49: tokNumber
		50: tokNumber
		51: tokNumber
		52: tokNumber
		53: tokNumber
		54: tokNumber
		55: tokNumber
		56: tokNumber
		57: tokNumber
		34:					// " string
			collectstring
			strpack tokstring inbuf TP+1 CLEN-2
			tok = tkSTRING
		35: tokHex			// # hex num
		33: if d == 61		// !=
				CLEN = 2
				tok = tkNE
			else
				tok = tkNOT
		37: tok = tkMOD
		38: tok = tkAND
		40: tok = tkLPAREN
		41: tok = tkRPAREN
		42: tok = tkSTAR
		43: tok = tkPLUS
		45: tok = tkMINUS
		47: tok = tkSLASH
		58: tok = tkCOLON
		60: if d == 60		// <<
				CLEN = 2
				tok = tkLTLT
			else if d == 61 // <=
				CLEN = 2
				tok = tkLE
			else
				tok = tkLT
		61: if d == 61		// ==
				CLEN = 2
				tok = tkEQEQ
			else
				tok = tkEQ
		62: if d == 62		// >>
				CLEN = 2
				tok = tkGTGT
			else if d == 61 // >=
				CLEN = 2
				tok = tkGE
			else
				tok = tkGT
		91: tok = tkLBRACKET // [
		93: tok = tkRBRACKET // ]
		94: tok = tkCARET	 // ^
		123: tok = tkBB		 // {
		124: tok = tkBAR	 // |
		125: tok = tkBE		 // }
		else:
			if c == 255		// EOF
				tok = tkEOF
			else			// it is an indentifier
				token
				strpack tokstring inbuf TP CLEN
//				tokvalue = install tokstring
				tok = tkIDEN
//				if (getType tokvalue) == tyKEY // key word
//					tok = getRef tokvalue
	// end case
	TP = TP + CLEN

to prtoken tk = {}

// lexstate N=1, F=2, B=3
// state description   condition  action    next
//   N  normal  lex     oldline   out tok     N
//   N                  nl & ==   out tok     N
//   N                  nl & >    out { push  F
//   N                  nl & <    out } pop   B
//   F  forward         -         out oldtok  N
//   B  backward        ==        out oldtok  N
//   B                  <         out } pop   B
//   transition occurs on call lex
//   eof is a special case

to lex | oldline =
	case lexstate
		1:	// normal
			oldline = line
			lex1
//          printc 34 prtoken tok printc 34 space
			oldtok = tok
			if tok == tkEOF			// out BE til match
				if colstk[colsp] != 0
					tok = tkBE
					popcol
				break
			if line != oldline		// newline
				if tokcol > colstk[colsp]
					tok = tkBB
					pushcol tokcol
					lexstate = 2	// F
				else if tokcol < colstk[colsp]
					tok = tkBE
					popcol
					lexstate = 3	// B
				// tokcol == colstk, just return tok
		2:	// forward
			tok = oldtok
			lexstate = 1			// N
		3:	// backward
			if tokcol >= colstk[colsp]
				tok = oldtok
				lexstate = 1		// N
			else // if tokcol < colstk[colsp]
				tok = tkBE
				popcol

// original prtoken (in som-v2) is 251 words
// this version using tok_str is 41 words

// print string of token, len 3 each
tok_str = array
	"*" 0 "/" 0 "-" 0 "+" 0 "=" 0 "==" 0 "&" 0 "|" 0
	"^" 0 "%" 0 "!" 0 "!=" 0 "<" 0 "<=" 0 "<<" 0 ">" 0
	">=" 0 ">>" 0 ":" 0 "(" 0 ")" 0 "[" 0 "]" 0 "{" 0
	"}" 0 "to" 0 "if" 0 "else" 0 "while" "for" 0 "break" "array"
	"case" 0 "enum" 0 "syscall"

to prtoken tk | a =
	if tk > tkSYSCALL
		seterror "unknown token"
	if tk >= tkSTAR
		a = tk - tkSTAR		// a 0..34
		prints tok_str+(a*3)
		break
	case tk
		tkIDEN: prints tokstring
		tkNUMBER: print tokvalue
		tkSTRING:
			printc 34 prints tokstring printc 34
		tkEOF: prints "eof"

to testtok  =
	startlex
	lex
	while tok != tkEOF
		prtoken tok space
		lex

to main =
	initlex
	testtok

// End
