/*  zweic -- a compiler for zwei
 *
 *  Stephane Micheloud & LAMP
 *
 *  $Id$
 */

package zweic;

import java.io.{InputStream, IOException};


/**
 * This class implements the scanner of the zwei compiler.
 *
 * Constructing a scanner immediately reads the first character and the
 * first token from the input, so 'token', 'start' and 'chars' already
 * describe the first token when the constructor returns.
 *
 * Every byte returned by 'in.read()' is treated as one character.
 */
class Scanner(in: InputStream) {
  import scala.collection.mutable.HashMap;
  import Tokens._;

  /**
   * This character represents the end of the input.
   */
  private val EOF_CH = java.lang.Character.MAX_VALUE;

  /**
   * A table that associates a keyword with its corresponding
   * token class.
   */
  private val keywords = new HashMap[String, Token]();

  // literals and special names
  keywords("true") = TRUE;
  keywords("false") = FALSE;
  keywords("this") = THIS;
  keywords("Null") = NULLTYPE;
  keywords("null") = NULLFACTOR;

  // built-in input/output operations
  keywords("readInt") = READINT;
  keywords("readChar") = READCHAR;
  keywords("printInt") = PRINTINT;
  keywords("printChar") = PRINTCHAR;

  // declarations and statements
  keywords("val") = VAL;
  keywords("Int") = INT;
  keywords("def") = DEF;
  keywords("set") = SET;
  keywords("var") = VAR;
  keywords("class") = CLASS;
  keywords("new") = NEW;
  keywords("extends") = EXTENDS;
  keywords("return") = RETURN;

  // word operators and control flow
  keywords("and") = AND;
  keywords("or") = OR;
  keywords("if") = IF;
  keywords("else") = ELSE;
  keywords("do") = DO;
  keywords("while") = WHILE;

  /** The current token class.
   */
  var token: Token = BAD;

  /** The position of the first character of the current token,
   *  as encoded by 'Position.encode(line, column)'.
   */
  var start: Int = 0;

  /**
   * The string representation of the current token. This variable
   * is only set if the current token class has several possible
   * textual representations (identifiers, numbers and strings).
   */
  var chars: String = _;

  /**
   * A buffer for constructing string representations of tokens.
   */
  private val buf = new StringBuffer();

  /**
   * The current character.
   */
  private var ch = ' ';

  /**
   * The line and the column of the current character.
   */
  private var line = 1;
  private var column = 0;

  /**
   * The last raw character delivered by 'readCh'; used by 'nextCh' to
   * recognise a '\n' that directly follows a '\r'.
   *
   * This field must be declared before the constructor statements
   * below: fields are initialised in textual order, so declaring it
   * after them would reset it to ' ' once the first token has been
   * read and make a "\r\n" following the first token count as two
   * line breaks.
   */
  private var oldch: Char = ' ';

  // prime the scanner: load the first character and the first token
  nextCh;
  nextToken;

  /**
   * This method reads the next token and stores the token class
   * in variable 'token'. If the token representation is not unique
   * it will also leave a textual representation in variable 'chars'.
   * Whitespaces and '//' line comments are skipped by this method;
   * 'start' is set to the position of the first character of the
   * token that is found after them.
   */
  def nextToken: Unit = {
    buf.setLength(0);

    // skip comments and whitespace characters; a '/' that does not
    // open a comment is the division operator and ends the search
    var division = false;
    while (!division && (Character.isWhitespace(ch) || ch == '/')) {
      if (ch == '/') {
        // remember where the '/' is in case it turns out to be a token
        start = Position.encode(line, column);
        nextCh;
        if (ch == '/') {
          // line comment: drop everything up to the end of the line
          while (ch != '\n' && ch != EOF_CH) {
            nextCh
          }
        } else {
          division = true
        }
      } else {
        nextCh
      }
    }

    if (division) {
      token = DIV
    } else {
      // the current character is the first character of the token
      start = Position.encode(line, column);
      token = readToken
    }
  }

  /**
   * Consumes the current character and, if the character that follows
   * is 'second', consumes that one as well.
   * Returns 'pair' when both characters were consumed, 'single' otherwise.
   */
  private def oneOrTwo(second: Char, pair: Token, single: Token): Token = {
    nextCh;
    if (ch == second) {
      nextCh;
      pair
    } else {
      single
    }
  }

  /**
   * Consumes the current character 'op', which is only valid when it
   * is doubled. Returns 'pair' for the doubled form; otherwise reports
   * a parse error and returns BAD.
   */
  private def doubled(op: Char, pair: Token): Token = {
    nextCh;
    if (ch == op) {
      nextCh;
      pair
    } else {
      Report.fail(start, "error: parse error on '" + op + "' token");
      BAD
    }
  }

  /**
   * Read the next token, store its representation (if it's not
   * unique) in variable 'chars' and return the token class.
   */
  private def readToken: Token = ch match {

    case '(' => nextCh; LPAREN
    case ')' => nextCh; RPAREN
    case '{' => nextCh; LACCOLADE
    case '}' => nextCh; RACCOLADE

    case '-' => nextCh; SUB
    case '+' => nextCh; ADD
    case '*' => nextCh; MUL
    case '%' => nextCh; MOD
    // 'nextToken' already handles '/', so this case is only a safety net
    case '/' => nextCh; DIV

    case ';' => nextCh; SEMICOLON
    // NOTE(review): ',' yields the token named PERIOD while '.' yields
    // DOT -- confirm against the definitions in Tokens.
    case ',' => nextCh; PERIOD
    case '.' => nextCh; DOT
    case ':' => nextCh; COLON

    case '!' => oneOrTwo('=', NE, NOT)
    case '=' => oneOrTwo('=', EQ, EQUALS)
    case '<' => oneOrTwo('=', LE, LT)
    case '>' => oneOrTwo('=', GE, GT)

    case '&' => doubled('&', AND)
    case '|' => doubled('|', OR)

    case '"' =>
      // string literal: everything up to the closing quote on this line
      nextCh;
      while (ch != '"' && ch != '\n' && ch != '\r' && ch != EOF_CH) {
        buf.append(ch);
        nextCh
      }
      if (ch == '"') {
        chars = buf.toString();
        nextCh;
        STRING
      } else {
        if (ch == EOF_CH)
          Report.fail(start, "error: unexpected EOF")
        else {
          // NOTE(review): the character reported here is the one that
          // follows the line break ending the string -- confirm that
          // this is the intended diagnostic.
          nextCh;
          Report.fail(start, "error: parse error on '"+ch+"' token. expected \"")
        }
        BAD
      }

    case _ if (Character.isLetter(ch)) =>
      // identifier or keyword: a letter followed by letters, digits or '_'
      while (Character.isLetter(ch) || Character.isDigit(ch) || ch == '_') {
        buf.append(ch);
        nextCh
      }
      chars = buf.toString();
      keywords.get(chars) match {
        case Some(t) => t
        case _ => IDENT
      }

    case _ if (Character.isDigit(ch)) =>
      // a leading '0' is a complete number by itself; any other digit
      // starts a maximal run of digits
      // NOTE(review): "007" is therefore scanned as three numbers --
      // presumably required by the zwei grammar, confirm.
      if (ch == '0') {
        buf.append(ch);
        nextCh
      } else {
        while (Character.isDigit(ch)) {
          buf.append(ch);
          nextCh
        }
      }
      chars = buf.toString();
      NUMBER

    case EOF_CH =>
      EOF

    case _ =>
      // remember the offending character before moving past it, so that
      // the message names it rather than the character that follows
      val bad = ch;
      nextCh;
      Report.fail(start, "error: parse error on '" + bad + "' token");
      BAD
  }

  /**
   * Returns a textual representation of the current token: the token
   * class, followed by the token text in parentheses and quotes for
   * numbers, identifiers and strings.
   */
  def representation: String = {
    val name = token.toString();
    if (token == NUMBER || token == IDENT || token == STRING)
      name + "(\"" + chars + "\")"
    else
      name
  }

  /**
   * Puts the next character into 'ch' and updates the current position.
   * Does nothing once the end of the input has been reached. The line
   * breaks "\r" and "\r\n" are both delivered as a single '\n'.
   * An IOException raised while reading is passed to 'Report.fail'.
   */
  private def nextCh: Unit = {
    if (ch != EOF_CH) {
      // advance the position past the character that is being left
      if (ch == '\n') {
        column = 1;
        line = line + 1
      } else {
        column = column + 1
      }
      try {
        ch = readCh;
        // a '\n' right after a '\r' belongs to the same line break and
        // is replaced by the character that follows it
        oldch = if ((oldch == '\r') && (ch == '\n')) readCh else ch;
        // a '\r' is always handed out as '\n'
        ch = if (oldch == '\r') '\n' else oldch
      }
      catch {
        case e: IOException => Report.fail(start, e.getMessage())
      }
    }
  }

  /**
   * Reads one raw character from the input; returns EOF_CH when
   * 'in.read()' signals the end of the stream with a negative value.
   */
  private def readCh: Char = {
    val c = in.read();
    if (c < 0) EOF_CH else c.asInstanceOf[Char]
  }
}