// lexer.txt    lexical analyser for Som
//
//    28 Oct 2009
//  string-s.txt
//  compile-h-s.txt
//  token-s.txt

//  when run against itself, token count is 2120
//  > som5 lexer.txt
//  > som5 -x lexer.obj  (noi 212746) (if noout 162257)

// string-s.txt   string functions
//    som-string is an array of int
//    an int contains at most 4 char, right pad with 0
//    terminate by int 0

//    public release som v3.0   5 March 2007 (Maka-bucha day)
//    public release som v3.1 	19 Aug 2007 (Draft vote day)
//    public release som v4.0   2 July 2008
//    public release som v4.1   9 Aug 2008 (Birthday)

//    improve efficiency by nested-if  29 July 2008

// copy s1 = s2
to strcpy s1 s2 | i =
	i = 0
	while s2[i] != 0
		s1[i] = s2[i]
		i = i + 1
	s1[i] = 0

to strlen s | k =
	k = 0
	while s[k] != 0
		k = k + 1
	k

// debug per "som-string-bug.txt, 7/9/2007" 16 Mar 2008
to streq s1 s2 | i =
	i = 0
	while and (s1[i] == s2[i]) s1[i]
		i = i + 1
	s1[i] == s2[i]

// s1 is som-string (packed string)
// s1 contains only number (no sign)
to atoi s1 | a i c1 c2 c3 c4 v =
	v = 0
	i = 0
	a = s1[i]
	while a != 0
		c1 = (a >> 24) & 255
		if c1 != 0
			v = v*10 + c1 - 48
			c2 = (a >> 16) & 255
			if c2 != 0
				v = v*10 + c2 - 48
				c3 = (a >> 8) & 255
				if c3 != 0
					v = v*10 + c3 - 48
					c4 = a & 255
					if c4 != 0
						v = v*10 + c4 - 48
		i = i + 1
		a = s1[i]
	v

// print string s1, s1 is som-string
// use nested if
to fprints fo s1 | a i c =
	i = 0
	a = s1[i]
	while a != 0
		c = (a >> 24) & 255		// left most
		if c != 0
			fprintc fo c
			c = (a >> 16) & 255
			if c != 0
				fprintc fo c
				c = (a >> 8) & 255
				if c != 0
					fprintc fo c
					c = a & 255
					if c != 0
						fprintc fo c
		i = i + 1
		a = s1[i]

: prints s1 = fprints 1 s1		// prints to stdout

// pack array of char to som-string s1
// use nested if
to strpack s1 ar start len | a k i e =
	k = 0
	i = start
	e = start + len
	while i < e
		a = ar[i] << 24
		i = i + 1
		if i < e
			a = a | (ar[i] << 16)
			i = i + 1
			if i < e
				a = a | (ar[i] << 8)
				i = i + 1
				if i < e
					a = a | ar[i]
					i = i + 1
		s1[k] = a
		k = k + 1
	s1[k] = 0

// End

// compile-h-s.txt
// header file for som-in-som project
//  public release som-v24 10 January 2007
//    Celebrating H.M.King 60 years accend to the throne.
//  public release som v3.0   5 March 2007 (Maka-bucha day)
//  public release som v3.1   19 Aug 2007  (Draft vote day)
//  public release som v4.0   2 July 2008
//  public release som v4.1   9 Aug 2008 (Birthday)

// sequence of loading files
//
// lib.som
// string-s.txt
// compile-h-s.txt
// list-s.txt
// symtab-s.txt
// token-s.txt

enum	// marker
	0 NIL MARK BMARK
//enum	// atom
//	0 SP OPER NUM GNAME LNAME CNAME STRING ADS
//enum	// u-code
//	1 icAdd icSub icMul icDiv icBand
//	icBor icBxor icMod icEq icNe
//	icLt icLe icGt icGe icShl
//	icShr icAddi icSubi icBandi icBori
//	icEqi icLti icLei icShli icShri
//	icInc icDec icLit icSys icNot
//	icPush icGet icPut icLd icSt
//	icLdxa icLdya icCall icRet icCallt
//	icJmp icJt icJf icCase icLdx
//	icStx icLdy icSty icJle icFun  // 50

enum
	50 EOC

enum	// token type
	14 tkIDEN tkNUMBER tkSTRING tkEOF tkERROR
enum	// token
	50 tkSTAR tkSLASH tkMINUS tkPLUS tkEQ tkEQEQ
	tkAND tkBAR tkCARET tkMOD tkNOT tkNE tkLT tkLE
	tkLTLT tkGT tkGE tkGTGT tkCOLON tkLPAREN tkRPAREN
	tkLBRACKET tkRBRACKET tkBB tkBE tkTO tkIF tkELSE
	tkWHILE tkFOR tkBREAK tkARRAY tkCASE tkENUM tkSYSCALL
enum	// symbol type
	6 tyFUNC tyLOCAL tyGVAR tyENUM tyKEY tyNEW tyMAC
enum	// denote gvar arg for DS relocation
	0 VSCALAR VARRAY VSTRING
enum
	5678941 OBJ_SOM41
enum	// bop type for gencode
	50 xLV xGV xLIT xEX xCM xLG xUD
enum	// op format
	60 mV mG mC mJ mW mY mF

//  memory model
//  M [ MAXMEM  ]		 declared in C
//  M[1..MAXSYS]  		 system area
//	CS[MAXCS]			 code segment
//       ^
//       CP
//  ystack[MAXYSTK]		 parser stack
//           ^
//          ysp
//  cell[MAXCELL]		 store list (parse-tree)
//  data segment is absolute M[]

//enum
//	20000 MAXCS		// size of code segment
//enum
//	1000 MACSIZE	// size of macro
//enum
//	2000 MAXYSTK	// size of parser stack
//enum
//	200  MAXCALL	// size of call forward list
//enum
//	100  MAXLV		// size of lv table

//  system area  1 .. MAXSYS-1
enum
	101 mode_ads

// never use in lexer
to initSom | a =
	M = 0				// base of M, alias with C M[]
	CP = 6  			// code pointer
	CS = array MAXCS
	ystack = array MAXYSTK  // parser stack
	ysp = 0
	currentf = 0		// current fn idx
	nenum = 0			// enum number
	verbose = 1			// control compiler message
	cmode = 0			// compile mode, 0 def, 1 im, 2 rep
	callis = array MAXCALL	// list of call forward
	CPX = MAXCS-MACSIZE	// immediate & macro code area
	exsym = array 500 	// export symbol
	nxsym = 0
	line = 0
	FI = 0				// input file
	FO = 1				// output file
	vrec = array MAXLV
	vsp = 0				// vrec sp
	jplis = array 1000	// ads of icJmp
	relis = array (MAXCS/4) // reloc list
	lastCP = 0

: fspace = fprintc FO 32
: fnl = fprintc FO 10

// when no line number is available (gencode, eval, etc.)
to error x =
	prints "error: " prints x nl
	exit

// error during parsing
to seterror s =
	prints "line " print line space
	prints s nl
	exit

to warning nm mess =
	if verbose
		prints "Warning: "
		if nm != 0
			prints nm space
		prints mess nl

// End

// token-s.txt
//   public release som v3.0   5 March 2007 (Maka-bucha day)
//   public release som v3.1   19 Aug 2007 (Draft vote day)
//   public release som v4.0   2 July 2008
//   public release som v4.1   9 Aug 2008 (Birthday)

enum
	80 maxcol				// max depth of nested indent

// bitvector array 256 of hexval:hex:sep:space:letter
// for example <9> 1001:1:0:0:1 9*16+9
//
// cLetter 	# 0..9 A..Z _ a..z
// cSpace 	<1>..<32> <128>..<254>
// cSep 	"!%&()*+-/:<=>[]^{|} <0> <EOF_CHAR>
// cHex 	0..9 A..F a..f
// use gen-ctype-s.txt  to generate ctype[.]

ctype = array
	4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 4 4 1 2 4 4 2 4 4 4 4 2 4 2 4 9 25 41 57 73 89 105 121 137 153 4 2 4 4 4 2
	2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 2 4 4 1
	2 169 185 201 217 233 249 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4

: isNum c = (c >= 48) & (c <= 57)	// 0..9
: isHex c = ctype[c] & 8
: isSpace c = ctype[c] & 2
: isSep c = ctype[c] & 4
: isLetter c = ctype[c] & 1
: hexval c = (ctype[c] >> 4) & 15

to initlex =
	inbuf = array 1000		// input line buffer
	tok = 0					// current token
	tokvalue = 0			// token value
	tokstring = array 256	// token string
	// column stack for block indentation
	tokcol = 0				// current token column
	colstk = array maxcol	// column stack
	colsp = 0

to pushcol col =
	if colsp >= maxcol
		seterror "lex error col stack overflow"
	else
		colsp = colsp + 1
		colstk[colsp] = col

to popcol =
	if colsp <= 0
		seterror "lex error col stack underflow"
	else
		colsp = colsp - 1

to startlex =
	CLEN = 0				// current token length
	TP = 0					// current token pointer
	line = 0				// current line number
	lexstate = 1			// state of lex
	oldtok = 0				// old token
	colsp = 0				// col stack pointer
	pushcol 0

// get whole word, update CLEN
to token | pos =
	pos = TP
	while isLetter inbuf[pos]
		pos = pos + 1
	CLEN = pos - TP

// read until " or end of line, CLEN whole str ".."
to collectstring | a c =
	a = TP+1
	c = inbuf[a]
	while (c != 34) & (c != 0) // until " or eol
		a = a + 1
		c = inbuf[a]
	CLEN = a - TP + 1

to tokNumber | c i v =
	token
	v = 0
	for i TP TP+CLEN-1
		c = inbuf[i]
		if ! isNum c
			seterror "invalid integer"
		v = v*10 + c - 48
	tokvalue = v
	tok = tkNUMBER

to tokHex | c i h =
	token
	c = inbuf[TP+1]			// check first digit
	if ! isHex c
		seterror "invalid hex"
	h = hexval c
	if h > 7
		h = h - 16			// sign extension
	for i TP+2 TP+CLEN-1
		c = inbuf[i]
		if ! isHex c
			seterror "invalid hex"
		h = h*16 + hexval c
	tokvalue = h
	tok = tkNUMBER

// to terminate a line, pad EOF at end of line
// for interactive mode r-e-p
to padEOF | pos =
	pos = 0
	while inbuf[pos] != 0
		pos = pos + 1
	inbuf[pos] = 255		// EOF_CHAR
	inbuf[pos+1] = 0

// affect CLEN, tok, tokvalue, tokstring. change TP
to lex1 | pos c d =
	pos = TP
	while 1				// read until get a token
		if or (TP == 0)(inbuf[pos] == 0)	// end buffer
			fgets FI inbuf					// get a line
			line = line + 1
			pos = 0
			if cmode == 2 padEOF			// interactive
		while isSpace inbuf[pos]			// skip space
			pos = pos + 1
		c = inbuf[pos]
		d = inbuf[pos+1]					// lookahead
		if c == 0 							// buffer empty
			TP = 0
		else if and (c == 47) (d == 47) 	// "//"
			TP = 0
		else
			break
	TP = pos
	tokcol = pos
	CLEN = 1
	case c
		48: tokNumber		// 0..9
		49: tokNumber
		50: tokNumber
		51: tokNumber
		52: tokNumber
		53: tokNumber
		54: tokNumber
		55: tokNumber
		56: tokNumber
		57: tokNumber
		34:					// " string
			collectstring
			strpack tokstring inbuf TP+1 CLEN-2
			tok = tkSTRING
		35: tokHex			// # hex num
		33: if d == 61		// !=
				CLEN = 2
				tok = tkNE
			else
				tok = tkNOT
		37: tok = tkMOD
		38: tok = tkAND
		40: tok = tkLPAREN
		41: tok = tkRPAREN
		42: tok = tkSTAR
		43: tok = tkPLUS
		45: tok = tkMINUS
		47: tok = tkSLASH
		58: tok = tkCOLON
		60: if d == 60		// <<
				CLEN = 2
				tok = tkLTLT
			else if d == 61 // <=
				CLEN = 2
				tok = tkLE
			else
				tok = tkLT
		61: if d == 61		// ==
				CLEN = 2
				tok = tkEQEQ
			else
				tok = tkEQ
		62: if d == 62		// >>
				CLEN = 2
				tok = tkGTGT
			else if d == 61 // >=
				CLEN = 2
				tok = tkGE
			else
				tok = tkGT
		91: tok = tkLBRACKET // [
		93: tok = tkRBRACKET // ]
		94: tok = tkCARET	 // ^
		123: tok = tkBB		 // {
		124: tok = tkBAR	 // |
		125: tok = tkBE		 // }
		else:
			if c == 255		// EOF
				tok = tkEOF
			else			// it is an indentifier
				token
				strpack tokstring inbuf TP CLEN
//				tokvalue = install tokstring
				tok = tkIDEN
//				if (getType tokvalue) == tyKEY // key word
//					tok = getRef tokvalue
	// end case
	TP = TP + CLEN

to prtoken tk = {}

// lexstate N=1, F=2, B=3
// state description   condition  action    next
//   N  normal  lex     oldline   out tok     N
//   N                  nl & ==   out tok     N
//   N                  nl & >    out { push  F
//   N                  nl & <    out } pop   B
//   F  forward         -         out oldtok  N
//   B  backward        ==        out oldtok  N
//   B                  <         out } pop   B
//   transition occurs on call lex
//   eof is a special case

to lex | oldline =
	case lexstate
		1:	// normal
			oldline = line
			lex1
//          printc 34 prtoken tok printc 34 space
			oldtok = tok
			if tok == tkEOF			// out BE til match
				if colstk[colsp] != 0
					tok = tkBE
					popcol
				break
			if line != oldline		// newline
				if tokcol > colstk[colsp]
					tok = tkBB
					pushcol tokcol
					lexstate = 2	// F
				else if tokcol < colstk[colsp]
					tok = tkBE
					popcol
					lexstate = 3	// B
				// tokcol == colstk, just return tok
		2:	// forward
			tok = oldtok
			lexstate = 1			// N
		3:	// backward
			if tokcol >= colstk[colsp]
				tok = oldtok
				lexstate = 1		// N
			else // if tokcol < colstk[colsp]
				tok = tkBE
				popcol

// original prtoken (in som-v2) is 251 words
// this version using tok_str is 41 words

// print string of token, len 3 each
tok_str = array
	"*" 0 "/" 0 "-" 0 "+" 0 "=" 0 "==" 0 "&" 0 "|" 0
	"^" 0 "%" 0 "!" 0 "!=" 0 "<" 0 "<=" 0 "<<" 0 ">" 0
	">=" 0 ">>" 0 ":" 0 "(" 0 ")" 0 "[" 0 "]" 0 "{" 0
	"}" 0 "to" 0 "if" 0 "else" 0 "while" "for" 0 "break" "array"
	"case" 0 "enum" 0 "syscall"

to prtoken tk | a =
	if tk > tkSYSCALL
		seterror "unknown token"
	if tk >= tkSTAR
		a = tk - tkSTAR		// a 0..34
		prints tok_str+(a*3)
		break
	case tk
		tkIDEN: prints tokstring
		tkNUMBER: print tokvalue
		tkSTRING:
			printc 34 prints tokstring printc 34
		tkEOF: prints "eof"

to testtok | cnt =
	FI = fopen "lexer.txt" 0
	cnt = 0
	startlex
	lex
	while tok != tkEOF
		if verbose
			prtoken tok space
		lex
		cnt = cnt + 1
	fclose FI
	print cnt nl

to main =
	initlex
	verbose = 1
	testtok

// End

