Wednesday, April 09, 2014

Lexing in Go

One of my standard exercises when I'm looking at a new language is to implement the lexical scanner for Suneido. I've done this in quite a variety of languages - C++, Java, C#, D, and now Go. The scanner is simple, and generally the implementations are similar.

The Go code is longer (in lines) than most of the other implementations for a couple of reasons. One is that Go doesn't have the ?: conditional operator, so you have to use if-else. Another is that gofmt puts the enum-style constants one per line.
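
A trivial illustration of the first point (not from the scanner itself): what would be a one-line conditional expression in C or Java becomes an if-else in Go.

package main

import "fmt"

// A stand-in for the kind of two-way choice the scanner makes over and over.
func tokenName(isKeyword bool) string {
	// C or Java could write: return isKeyword ? "KEYWORD" : "IDENTIFIER";
	if isKeyword {
		return "KEYWORD"
	} else {
		return "IDENTIFIER"
	}
}

func main() {
	fmt.Println(tokenName(false)) // IDENTIFIER
}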

Go doesn't have enums as such; you implement them with simple numeric constants. That works OK, but it's awkward for debugging, because if you print one you just get a number.
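
A minimal sketch of what that means, borrowing a few of the token names from the code below: an iota constant block gives each name a number, and printing one shows only that number unless you also write a String() method by hand.

package main

import "fmt"

type Token int

const (
	NIL Token = iota // 0
	EOF              // 1
	IDENTIFIER       // 2
)

func main() {
	t := IDENTIFIER
	fmt.Println(t) // prints "2", not "IDENTIFIER"
}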

One interesting thing about the Go implementation is that it handles Unicode without really having to worry about it much.
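
That mostly falls out of the language: strings are UTF-8, and the unicode and unicode/utf8 packages decode and classify runes, which is all the scanner's read() method needs. A small standalone illustration (not part of the scanner):

package main

import (
	"fmt"
	"unicode"
	"unicode/utf8"
)

func main() {
	src := "héllo"
	for i := 0; i < len(src); {
		c, w := utf8.DecodeRuneInString(src[i:]) // decode one rune and get its byte width
		fmt.Printf("%c is a letter: %v\n", c, unicode.IsLetter(c))
		i += w
	}
}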

I debated whether to return the results as a struct or as multiple return values. But that really depends on which is easier for the calling code.
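
For comparison, the two styles look roughly like this; the names here are illustrative, not the lex package's actual API.

package main

import "fmt"

type result struct {
	token   int
	keyword int
	value   string
}

// Style 1: return a struct, as the code below does with Item.
func nextStruct() result {
	return result{token: 1, keyword: 0, value: "foo"}
}

// Style 2: return multiple values; callers can ignore what they don't need.
func nextValues() (token, keyword int, value string) {
	return 1, 0, "foo"
}

func main() {
	r := nextStruct()
	fmt.Println(r.token, r.value)

	tok, _, val := nextValues()
	fmt.Println(tok, val)
}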

There is a good talk by Rob Pike about Lexical Scanning in Go. If you're not interested enough to watch the video, you can skim the slides. I didn't need to use his fancier concurrent state-machine design, but it's an interesting example of using Go. You can see a full implementation in the Go template package.
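
The heart of that design is a state function type: each state does some scanning, emits items on a channel, and returns the next state. Here's a toy, self-contained sketch of the pattern (my own illustration of the idea from the talk, not code from this scanner or from the template package):

package main

import "fmt"

type item struct{ val string }

type lexer struct {
	input string
	pos   int
	items chan item
}

// Each state is a function that does some work and returns the next state.
type stateFn func(*lexer) stateFn

// lexWords is a toy state: it emits space-separated words as items.
func lexWords(l *lexer) stateFn {
	for l.pos < len(l.input) {
		if l.input[l.pos] == ' ' {
			l.items <- item{l.input[:l.pos]}
			l.input = l.input[l.pos+1:]
			l.pos = 0
			return lexWords // re-enter the same state for the next word
		}
		l.pos++
	}
	if l.pos > 0 {
		l.items <- item{l.input}
	}
	return nil // nil means: no more states, stop
}

// run drives the state machine until a state returns nil.
func (l *lexer) run() {
	for state := stateFn(lexWords); state != nil; {
		state = state(l)
	}
	close(l.items)
}

func main() {
	l := &lexer{input: "lexing in go", items: make(chan item)}
	go l.run() // the lexer runs concurrently, feeding items to the channel
	for it := range l.items {
		fmt.Println(it.val)
	}
}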

Here's the code for the scanner, lex.go (or view it on GitHub):

// Package lex implements the lexical scanner for Suneido
package lex

import (
	"bytes"
	"strings"
	"unicode"
	"unicode/utf8"
)

type lexer struct {
	src   string
	si    int
	start int
	width int
	value string
}

type Item struct {
	token   Token
	keyword Token
	value   string
}

func Lexer(src string) *lexer {
	return &lexer{src: src}
}

func (lxr *lexer) Next() Item {
	lxr.value = "" // clear any value left over from a previous string token
	token := lxr.next()
	var value string
	if lxr.value != "" {
		value = lxr.value
	} else {
		value = lxr.src[lxr.start:lxr.si]
	}
	keyword := NIL
	if token == IDENTIFIER && lxr.peek() != ':' {
		keyword = Keyword(value)
	}
	return Item{token, keyword, value}
}

func (lxr *lexer) next() Token {
	lxr.start = lxr.si
	c := lxr.read()
	switch c {
	case eof:
		return EOF
	case '#':
		return HASH
	case '(':
		return L_PAREN
	case ')':
		return R_PAREN
	case ',':
		return COMMA
	case ';':
		return SEMICOLON
	case '?':
		return Q_MARK
	case '@':
		return AT
	case '[':
		return L_BRACKET
	case ']':
		return R_BRACKET
	case '{':
		return L_CURLY
	case '}':
		return R_CURLY
	case '~':
		return BITNOT
	case ':':
		if lxr.match(':') {
			return RANGELEN
		} else {
			return COLON
		}
	case '=':
		if lxr.match('=') {
			return IS
		} else if lxr.match('~') {
			return MATCH
		} else {
			return EQ
		}
	case '!':
		if lxr.match('=') {
			return ISNT
		} else if lxr.match('~') {
			return MATCHNOT
		} else {
			return NOT
		}
	case '<':
		if lxr.match('<') {
			if lxr.match('=') {
				return LSHIFTEQ
			} else {
				return LSHIFT
			}
		} else if lxr.match('>') {
			return ISNT
		} else if lxr.match('=') {
			return LTE
		} else {
			return LT
		}
	case '>':
		if lxr.match('>') {
			if lxr.match('=') {
				return RSHIFTEQ
			} else {
				return RSHIFT
			}
		} else if lxr.match('=') {
			return GTE
		} else {
			return GT
		}
	case '|':
		if lxr.match('|') {
			return OR
		} else if lxr.match('=') {
			return BITOREQ
		} else {
			return BITOR
		}
	case '&':
		if lxr.match('&') {
			return AND
		} else if lxr.match('=') {
			return BITANDEQ
		} else {
			return BITAND
		}
	case '^':
		if lxr.match('=') {
			return BITXOREQ
		} else {
			return BITXOR
		}
	case '-':
		if lxr.match('-') {
			return DEC
		} else if lxr.match('=') {
			return SUBEQ
		} else {
			return SUB
		}
	case '+':
		if lxr.match('+') {
			return INC
		} else if lxr.match('=') {
			return ADDEQ
		} else {
			return ADD
		}
	case '/':
		if lxr.match('/') {
			return lxr.lineComment()
		} else if lxr.match('*') {
			return lxr.spanComment()
		} else if lxr.match('=') {
			return DIVEQ
		} else {
			return DIV
		}
	case '*':
		if lxr.match('=') {
			return MULEQ
		} else {
			return MUL
		}
	case '%':
		if lxr.match('=') {
			return MODEQ
		} else {
			return MOD
		}
	case '$':
		if lxr.match('=') {
			return CATEQ
		} else {
			return CAT
		}
	case '`':
		return lxr.rawString()
	case '"', '\'': // empty cases don't fall through in Go, so both quotes share one case
		return lxr.quotedString(c)
	case '.':
		if lxr.match('.') {
			return RANGETO
		} else if unicode.IsDigit(lxr.peek()) {
			return lxr.number()
		} else {
			return DOT
		}
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		lxr.backup() // let number() see the whole literal, including a 0x prefix
		return lxr.number()
	default:
		if isSpace(c) {
			return lxr.whitespace(c)
		} else if unicode.IsLetter(c) || c == '_' {
			return lxr.identifier()
		}
	}
	return ERROR
}

func (lxr *lexer) whitespace(c rune) Token {
	result := WHITESPACE
	for ; isSpace(c); c = lxr.read() {
		if c == '\n' || c == '\r' {
			result = NEWLINE
		}
	}
	lxr.backup()
	return result
}

func (lxr *lexer) lineComment() Token {
	for c := lxr.read(); c != eof && c != '\n'; c = lxr.read() {
	}
	return COMMENT
}

func (lxr *lexer) spanComment() Token {
	lxr.matchUntil(func() bool { return strings.HasSuffix(lxr.src[:lxr.si], "*/") })
	return COMMENT
}

func (lxr *lexer) rawString() Token {
	for c := lxr.read(); c != eof && c != '`'; c = lxr.read() {
	}
	lxr.value = lxr.src[lxr.start+1 : lxr.si-1] // strip the back quotes
	return STRING
}

func (lxr *lexer) quotedString(quote rune) Token {
	// the opening quote was already consumed by next()
	var buf bytes.Buffer
	for c := lxr.read(); c != eof && c != quote; c = lxr.read() {
		buf.WriteRune(lxr.doesc(c))
	}
	lxr.value = buf.String()
	return STRING
}

func (lxr *lexer) doesc(c rune) rune {
	if c != '\\' {
		return c
	}
	save := lxr.si
	c = lxr.read()
	switch c {
	case 'n':
		return '\n'
	case 't':
		return '\t'
	case 'r':
		return '\r'
	case 'x':
		dig1 := digit(lxr.read(), 16)
		dig2 := digit(lxr.read(), 16)
		if dig1 != -1 && dig2 != -1 {
			return rune(16*dig1 + dig2)
		}
	case '\\', '"', '\'':
		return c
	default: // \nnn octal escape; c is the first digit
		dig1 := digit(c, 8)
		dig2 := digit(lxr.read(), 8)
		dig3 := digit(lxr.read(), 8)
		if dig1 != -1 && dig2 != -1 && dig3 != -1 {
			return rune(64*dig1 + 8*dig2 + dig3)
		}
	}
	// not a valid escape sequence, treat the backslash literally
	lxr.si = save
	return '\\'
}

func digit(c rune, radix int) int {
	n := 99
	if isDigit(c) {
		n = int(c - '0')
	} else if isHexDigit(c) {
		n = int(10 + unicode.ToLower(c) - 'a')
	}
	if n < radix {
		return n
	} else {
		return -1
	}
}

func isDigit(r rune) bool {
	return '0' <= r && r <= '9'
}

func isHexDigit(r rune) bool {
	return strings.ContainsRune(hexDigits, r)
}

func (lxr *lexer) number() Token {
	lxr.matchOneOf("+-")
	// is it hex?
	digits := "0123456789"
	if lxr.match('0') && lxr.matchOneOf("xX") {
		digits = hexDigits
	}
	lxr.matchRunOf(digits)
	if lxr.match('.') {
		lxr.matchRunOf(digits)
	}
	if lxr.matchOneOf("eE") {
		lxr.matchOneOf("+-")
		lxr.matchRunOf("0123456789")
	}
	return NUMBER
}

func (lxr *lexer) identifier() Token {
	lxr.matchWhile(isIdentChar)
	if !lxr.match('?') {
		lxr.match('!')
	}
	return IDENTIFIER
}

const eof = -1

func (lxr *lexer) read() rune {
	if lxr.si >= len(lxr.src) {
		lxr.width = 0
		return eof
	}
	c, w := utf8.DecodeRuneInString(lxr.src[lxr.si:])
	lxr.si += w
	lxr.width = w
	return c
}

func (lxr *lexer) backup() {
	lxr.si -= lxr.width
}

func (lxr *lexer) peek() rune {
	c := lxr.read()
	lxr.backup()
	return c
}

func (lxr *lexer) match(c rune) bool {
	if c == lxr.read() {
		return true
	}
	lxr.backup()
	return false
}

func (lxr *lexer) matchOneOf(valid string) bool {
	if strings.ContainsRune(valid, lxr.read()) {
		return true
	}
	lxr.backup()
	return false
}

func (lxr *lexer) matchRunOf(valid string) {
	for strings.ContainsRune(valid, lxr.read()) {
	}
	lxr.backup()
}

func (lxr *lexer) matchWhile(f func(c rune) bool) {
	for c := lxr.read(); f(c); c = lxr.read() {
	}
	lxr.backup()
}

func (lxr *lexer) matchUntil(f func() bool) {
	for c := lxr.read(); c != eof && !f(); c = lxr.read() {
	}
}

func isIdentChar(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}

const hexDigits = "0123456789abcdefABCDEF"

func isSpace(c rune) bool {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

And here are the token definitions (tokens.go):

package lex

func Keyword(s string) Token {
	return keywords[s]
}

func (t Token) IsInfix() bool {
	return infix[t]
}

func (t Token) String() string {
	return tostring[t]
}

var tostring = map[Token]string{
	NIL:        "NIL",
	EOF:        "EOF",
	WHITESPACE: "WHITESPACE",
	COMMENT:    "COMMENT",
	NEWLINE:    "NEWLINE",
}

type Token int
const (
NIL Token = iota
EOF
ERROR
IDENTIFIER
NUMBER
STRING
WHITESPACE
COMMENT
NEWLINE
// operators and punctuation
HASH
COMMA
COLON
SEMICOLON
Q_MARK
AT
DOT
L_PAREN
R_PAREN
L_BRACKET
R_BRACKET
L_CURLY
R_CURLY
IS
ISNT
MATCH
MATCHNOT
LT
LTE
GT
GTE
NOT
INC
DEC
BITNOT
ADD
SUB
CAT
MUL
DIV
MOD
LSHIFT
RSHIFT
BITOR
BITAND
BITXOR
EQ
ADDEQ
SUBEQ
CATEQ
MULEQ
DIVEQ
MODEQ
LSHIFTEQ
RSHIFTEQ
BITOREQ
BITANDEQ
BITXOREQ
RANGETO
RANGELEN
// language keywords
AND
BOOL
BREAK
BUFFER
CALLBACK
CASE
CATCH
CHAR
CLASS
CONTINUE
CREATE
DEFAULT
DLL
DO
DOUBLE
ELSE
FALSE
FLOAT
FOR
FOREVER
FUNCTION
GDIOBJ
HANDLE
IF
IN
INT64
LONG
NEW
OR
RESOURCE
RETURN
SHORT
STRUCT
SWITCH
SUPER
THIS
THROW
TRUE
TRY
VOID
WHILE
// query keywords
ALTER
AVERAGE
CASCADE
COUNT
DELETE
DROP
ENSURE
EXTEND
HISTORY
INDEX
INSERT
INTERSECT
INTO
JOIN
KEY
LEFTJOIN
LIST
MAX
MIN
MINUS
PROJECT
REMOVE
RENAME
REVERSE
SET
SORT
SUMMARIZE
SVIEW
TIMES
TO
TOTAL
UNION
UNIQUE
UPDATE
UPDATES
VIEW
WHERE
// for AST
ARG
ASSIGNOP
BINARYOP
BLOCK
CALL
DATE
FOR_IN
MEMBER
METHOD
OBJECT
POSTINCDEC
PREINCDEC
RECORD
RVALUE
SELFREF
SUBSCRIPT
SYMBOL
)
var keywords = map[string]Token{
"and": AND,
"bool": BOOL,
"break": BREAK,
"buffer": BUFFER,
"callback": CALLBACK,
"case": CASE,
"catch": CATCH,
"char": CHAR,
"class": CLASS,
"continue": CONTINUE,
"default": DEFAULT,
"dll": DLL,
"do": DO,
"double": DOUBLE,
"else": ELSE,
"false": FALSE,
"float": FLOAT,
"for": FOR,
"forever": FOREVER,
"function": FUNCTION,
"gdiobj": GDIOBJ,
"handle": HANDLE,
"if": IF,
"in": IN,
"int64": INT64,
"is": IS,
"isnt": ISNT,
"long": LONG,
"new": NEW,
"not": NOT,
"or": OR,
"resource": RESOURCE,
"return": RETURN,
"short": SHORT,
"string": STRING,
"struct": STRUCT,
"super": SUPER,
"switch": SWITCH,
"this": THIS,
"throw": THROW,
"true": TRUE,
"try": TRY,
"void": VOID,
"while": WHILE,
"xor": ISNT,
}
var infix = map[Token]bool{
AND: true,
OR: true,
Q_MARK: true,
MATCH: true,
MATCHNOT: true,
ADD: true,
SUB: true,
CAT: true,
MUL: true,
DIV: true,
MOD: true,
LSHIFT: true,
RSHIFT: true,
BITOR: true,
BITAND: true,
BITXOR: true,
}
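
To show how the scanner is driven, here is the kind of loop a caller inside the package (or a test) might write; it's illustrative only, not part of the original code.

package lex

import "fmt"

// Illustrative only: scan a snippet and print each token until EOF,
// skipping whitespace. Token prints as a number here because only a few
// tokens have entries in the tostring map.
func demo() {
	lxr := Lexer("size = count + 1")
	for {
		item := lxr.Next()
		if item.token == EOF {
			break
		}
		if item.token == WHITESPACE || item.token == NEWLINE {
			continue
		}
		fmt.Printf("%d %q\n", item.token, item.value)
	}
}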

3 comments:

Raff said...

The statement "Go only supports simple numeric constants to implement enums. That works ok, but it's awkward for debugging because if you print them, you just get a number." is not completely true.

If you have a "typed" constant you can add a String() method. For example you should be able to do:

func (t Token) String() string {
	switch t {
	case VOID: return "void"
	case SOMETHING: return "something"
	...
	}
}

Not the best, since it requires extra work, but it's possible.

Andrew McKinlay said...

Yeah, you'll see in the code I do that, although only for some of the values. But it's verbose and a lot of duplication. (!DRY) If you add a token you have to remember to add it to the String method as well. It's not a huge problem, just a nuisance. I'm not sure how you'd fix it without adding undesired complexity to Go.

notzippy said...

Something new with Go is go generate, which may be helpful:
http://blog.golang.org/generate
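
go generate together with the stringer tool (golang.org/x/tools/cmd/stringer) would automate exactly the String() method discussed above. A sketch, assuming the usual stringer setup rather than anything in the code here: the directive sits above the existing Token declaration in tokens.go.

//go:generate stringer -type=Token
type Token int

Running go generate then writes token_string.go with a String() method covering every Token constant, so new tokens can't fall out of sync with their names.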