// token-s.txt
//   lex in som    		    18 Dec 2004
//   public relese som-v2   30 Dec 2004

enum
	80 maxcol

to initlex =
	cLetter = array 8		// bitvec 256 bits
	cSpace = array 8
	cSep = array 8

	inbuf = array 300		// input line buffer
	line = 0				// current line number
	TP = 0					// current token pointer
	CLEN = 0				// current token length
	tok = 0					// current token
	tokvalue = 0			// token value
	tokstring = array 100	// token string

	// column stack for block indentation
	tokcol = 0				// current token column
	colstk = array maxcol	// column stack
	colsp = 0				// stack pointer

	lexstate = 1			// state of lex
	oldtok = 0				// old token

// use bitvector for char type

// set bitvector to 1
to setbitvec bitv c | a =
	a = bitv[ c / 32 ] | (1 << (c % 32))
	bitv[ c / 32 ] = a

// test bitvector
to is1 bitv c =
	bitv[ c / 32 ] & (1 << ( c % 32))

to setbits bitv a b | i =
	for i a b setbitvec bitv i

to initctype | i =
	for i 0 7			// clear bitvec
		cLetter[i] = 0
		cSpace[i] = 0
		cSep[i] = 0

	setbits cLetter 48 57	// 0..9
	setbits cLetter 65 90	// A..Z
	setbits cLetter 97 122	// a..z
//	setbitvec cLetter 36 	// $
//	setbitvec cLetter 63 	// ?
	setbitvec cLetter 95 	// _

	setbits cSpace 1 32		// blank
	setbits cSpace 128 254

	setbitvec cSep 0
	setbits cSep 33 34		// ! "
//	setbitvec cSep 35  		// #
	setbits cSep 37 38		// % &
	setbits cSep 40 43		// ( ) * +
	setbitvec cSep 45  		// -
	setbitvec cSep 47  		// /
	setbitvec cSep 58  		// :
	setbits cSep 60 62		// < = >
	setbitvec cSep 91  		// [
	setbits cSep 93 94		// ] ^
	setbits cSep 123 125	// { | }
	setbitvec cSep 255 		// EOF_CHAR

// return pointer to begin of token and CLEN
// if "//" is found set CLEN = 0
to token | pos a =
	pos = TP
	if (CLEN == 0) | (inbuf[pos] == 0)	// end buffer
		syscall {4 inbuf}	    // get a line
//		nl
		line = line + 1
		pos = 0
	while is1 cSpace inbuf[pos]	// skip space
		pos = pos + 1
	tokcol = pos
	if is1 cSep inbuf[pos]
		CLEN = 1
		if (inbuf[pos]==47) & (inbuf[pos+1]==47) // "//"
			CLEN = 0			// denote blank line
		if inbuf[pos] == 0
			CLEN = 0
	else
		a = pos
		while is1 cLetter inbuf[a]
			a = a + 1
		CLEN = a - pos
	pos

to isNum c =
	(c >= 48) & (c <= 57) // 0..9

// do atoi in an array ar[start] with length len
to aatoi ar start len | i c v =
	v = 0
	for i start start+len-1
		c = ar[i]
		if ! isNum c
			seterror "invalid integer"
		v = v*10 + c - 48
	v

// read until " or end of line, CLEN not count "
to collectstring | a c =
	a = TP+1
	c = inbuf[a]
	while (c != 34) & (c != 0) // until " or eol
		a = a + 1
		c = inbuf[a]
	CLEN = a - TP - 1

// affect CLEN, tok, tokvalue, tokstring. change TP
to lex1 | c d =
	TP = token
	while CLEN == 0		// read until get a token
		TP = token
	c = inbuf[TP]
	if c == 255			// EOF
		tok = tkEOF
		break
	if c == 34   		// " string
		collectstring
		tok = tkSTRING
		strpack tokstring inbuf TP+1 CLEN
		TP = TP + CLEN + 2
		break
	else if isNum c
		tokvalue = aatoi inbuf TP CLEN
		tok = tkNUMBER
	else
		d = inbuf[TP+1]	// lookahead
		case c
			33: if d == 61		// !=
					CLEN = 2
					tok = tkNE
				else
					tok = tkNOT
			37: tok = tkMOD
			38: tok = tkAND
			40: tok = tkLPAREN
			41: tok = tkRPAREN
			42: tok = tkSTAR
			43: tok = tkPLUS
			45: tok = tkMINUS
			47: tok = tkSLASH
			58: tok = tkCOLON
			60: if d == 60		// <<
					CLEN = 2
					tok = tkLTLT
				else if d == 61 // <=
					CLEN = 2
					tok = tkLE
				else
					tok = tkLT
			61: if d == 61		// ==
					CLEN = 2
					tok = tkEQEQ
				else
					tok = tkEQ
			62: if d == 62		// >>
					CLEN = 2
					tok = tkGTGT
				else if d == 61 // >=
					CLEN = 2
					tok = tkGE
				else
					tok = tkGT
			91: tok = tkLBRACKET // [
			93: tok = tkRBRACKET // ]
			94: tok = tkCARET	 // ^
			123: tok = tkBB		 // {
			124: tok = tkBAR	 // |
			125: tok = tkBE		 // }

			else:		// it is a string
				strpack tokstring inbuf TP CLEN
				tokvalue = install tokstring
				tok = tkIDEN
				if (getAttr tokvalue) != 0
					if (getType tokvalue) == tyKEY // key word
						tok = getRef tokvalue
		// end case
	TP = TP + CLEN

to pushcol col =
	if colsp >= maxcol
		seterror "lex error col stack overflow"
	else
		colsp = colsp + 1
		colstk[colsp] = col

to popcol =
	if colsp <= 0
		seterror "lex error col stack underflow"
	else
		colsp = colsp - 1

//to prtoken tk = {}

// lexstate N=1, F=2, B=3
// state description   condition  action    next
//   N  normal  lex     oldline   out tok     N
//   N                  nl & ==   out tok     N
//   N                  nl & >    out { push  F
//   N                  nl & <    out } pop   B
//   F  forward         -         out oldtok  N
//   B  backward        ==        out oldtok  N
//   B                  <         out } pop   B
//   transition occurs on call lex
//   eof is a special case

to lex | oldline =
	case lexstate
		1:	// normal
			oldline = line
			lex1
//			printc 34 prtoken tok printc 34 space
			oldtok = tok
			if tok == tkEOF			// out BE til match
				if colstk[colsp] != 0
					tok = tkBE
					popcol
				break
			if line != oldline		// newline
				if tokcol > colstk[colsp]
					tok = tkBB
					pushcol tokcol
					lexstate = 2	// F
				else if tokcol < colstk[colsp]
					tok = tkBE
					popcol
					lexstate = 3	// B
				// tokcol == colstk, just return tok
		2:	// forward
			tok = oldtok
			lexstate = 1			// N
		3:	// backward
			if tokcol >= colstk[colsp]
				tok = oldtok
				lexstate = 1		// N
			else if tokcol < colstk[colsp]
				tok = tkBE
				popcol

to startlex =
	initctype
	TP = 0
	CLEN = 0
	pushcol 0

to prtoken tk =
	case tk
		tkSTAR: printc 42 	// "*"
		tkSLASH: printc 47 	// "/"
		tkMINUS: printc 45	// "-"
		tkPLUS: printc 43	// "+"
		tkEQ: printc 61		// "="
		tkEQEQ: prints "=="
		tkAND: printc 38	// "&"
		tkBAR: printc 124	// "|"
		tkCARET: printc 94	// "^"
		tkMOD: printc 37	// "%"
		tkNOT: printc 33	// "!"
		tkNE: prints "!="
		tkLT: printc 60		// "<"
		tkLE: prints "<="
		tkLTLT: prints "<<"
		tkGT: printc 62		// ">"
		tkGE: prints ">="
		tkGTGT: prints ">>"
		tkCOLON: printc 58	// ":"
		tkLPAREN: printc 40 // "("
		tkRPAREN: printc 41 // ")"
		tkLBRACKET: printc 91 // "["
		tkRBRACKET: printc 93 // "]"
		tkBB: printc 123 	// "{"
		tkBE: printc 125	// "}"
		tkTO: prints "to"
		tkIF: prints "if"
		tkELSE: prints "else"
		tkWHILE: prints "while"
		tkFOR: prints "for"
		tkBREAK: prints "break"
		tkARRAY: prints "array"
		tkCASE: prints "case"
		tkENUM: prints "enum"
		tkSYSCALL: prints "syscall"
		tkIDEN: prints tokstring
		tkNUMBER: print tokvalue
		tkSTRING:
			printc 34 prints tokstring printc 34
		tkEOF: prints "eof"

// print atom
to pratom type ref =
	case type
		SP:
			prints "NIL"
		OPER:
			prtoken ref
		NUM:
			print ref
		GNAME:
			prints getName ref
		LNAME:
	   		printc 35 print ref // #
		CNAME:
			prints "classname"
		STRING:
			printc 34 prints ref printc 34

// print list
to prlist a =
	if a == NIL break
	if isatom a
		pratom (car a) (cdr a)
		space
	else
		printc 40   // "("
		while a != NIL
			prlist car a
			a = cdr a
		printc 41   // ")"

// End
