CIT 594 Tokenizer in Scala
Spring 2014, David Matuszek

To refresh my knowledge of Scala, I decided to write the Tokenizer assignment in Scala. In fact, I wrote two versions--one that takes a Reader parameter (as in the assignment), and one that just takes a String.

As in the assignment, the Tokenizer is implemented as a state machine, but with one significant difference. The state machine is represented as a Graph (as in the Graph & Mazes assignment), where the Nodes are states and the values on the edges are functions.

The state machine I used is as follows:

Tokenizer image

Here's the Scala code for the version that tokenizes a String; comments are below the code.

package tokenizer

import graphSc._
import tokenStuff._
import scala.collection.JavaConversions._

/**
 * Tokenizer that takes a String as a parameter.
 * @author David Matuszek
 * @version April 2014
 */
class Tokenizer1(input: String, keywords: java.util.Set[String]) extends Tokenizer {
  
  type Test = Function1[Char, Boolean]
  
  // Necessary state information (boo, hiss)
  var startPosition = 0
  var previousStartPosition = 0
  
  /** Creates a node with the given name. */
  def node(name: String) = new graphSc.Node(name, g)
  
  /** Creates a functional edge from source to destination . */
  def edge(source: graphSc.Node, 
		  test: Char => Boolean,
		  destination: graphSc.Node) {
	  new graphSc.Edge(source, test, destination)
  }

  // Build the graph. Creates Nodes followed by their inpointing Edges.
  
  val g = new graphSc.Graph("Tokenizer")
  
  val ready = node("READY")
  edge(ready, (ch: Char) => ch.isWhitespace && ch != '\n', ready)
  
  val name = node("NAME")
  edge(ready, (Character.isJavaIdentifierStart _), name)
  edge(name, (Character.isJavaIdentifierPart _), name)
  
  val integer = node("INTEGER")
  edge(ready, Character.isDigit _, integer)
  edge(integer, Character.isDigit _, integer)
  
  val dot = node("DOT")
  edge(ready, _ == '.', dot)
  
  val slash = node("SLASH")
  edge(ready, _ == '/', slash)
  
  val symbol = node("SYMBOL") // maybe not needed?
  edge(ready, (ch: Char) =>
    ! ch.isWhitespace && ! Character.isJavaIdentifierStart(ch) &&
    ! Character.isDigit(ch) && ! "./".contains(ch), symbol)
    
  val float = node("FLOAT")
  edge(integer, _ == '.', float)
  edge(dot, Character.isDigit _, float)
  edge(float, Character.isDigit _, float)
  
  val comment = node("COMMENT")
  edge(slash, _ == '/', comment)
  edge(comment, _ != '\n', comment)
  
  val startingExponent = node("STARTING_EXPONENT")
  edge(integer, "Ee" contains _, startingExponent)
  edge(float, "Ee" contains _, startingExponent)
  
  val signedExponent = node("SIGNED_EXPONENT")
  edge(startingExponent, "+-" contains _, signedExponent)
  
  val exponentDigits = node("EXPONENT_DIGITS")
  edge(startingExponent, Character.isDigit _, exponentDigits)
  edge(signedExponent, Character.isDigit _, exponentDigits)
  edge(exponentDigits, Character.isDigit _, exponentDigits)
  
  val number = node("NUMBER")
  edge(integer, "lLfF" contains _, number)
  edge(float, "lLfF" contains _, number)
  
  val eol = node("EOL")
  edge(ready, _ == '\n', eol)
  edge(comment, _ == '\n', eol)
  
  val error = node("ERROR") // currently unused
  edge(error, (ch: Char) => true, error)
 
  /** Follow edges as far as possible. */
  def advance(position: Int, source: Node): (Int, Node) = {
    if (position >= input.length) return (position, source)
    val edges: List[Edge] = source.getOutpointingEdges().toList
    val optEdge = edges.find(_.getValue().asInstanceOf[Test](input(position)))
    optEdge match {
      case Some(edge) => advance(position + 1, edge.getDestination())
      case None => (position, source)
    }
  }
  
  /** Prepare to return the previously returned token again. */
  def pushBack { startPosition = previousStartPosition }
  
  /** Test whether there is another token. */
  def hasNext = !(input
    .substring(startPosition)
    .filter((ch: Char) => !ch.isWhitespace || ch == '\n').isEmpty)

  /** Get and return the next token. */
  def next: Token = {
    val (endPosition, endNode) = advance(startPosition, ready)
    val value = input.slice(startPosition, endPosition).trim
    previousStartPosition = startPosition
    startPosition = endPosition
    endNode.getValue() match {
      case "NAME" =>
        if (keywords.contains(value)) new Token(TokenType.KEYWORD, value)
        else new Token(TokenType.NAME, value)
      case "INTEGER" | "FLOAT" | "EXPONENT_DIGITS" | "NUMBER" =>
        new Token(TokenType.NUMBER, value)
      case "SYMBOL" | "DOT" | "SLASH" => new Token(TokenType.SYMBOL, value)
      case "EOL" => new Token(TokenType.EOL, "\n")
      case "COMMENT" => { startPosition = endPosition + 1; next }
    }
  }
}

Notes on the boring but necessary stuff:

More interesting notes: