The Go code is longer (in lines) than most of the other implementations for a couple of reasons. One is that Go doesn't have the ?: conditional operator, so you have to use if-else. Another is that gofmt puts enum-style constants one per line.
Go only supports simple numeric constants to implement enums. That works ok, but it's awkward for debugging because if you print them, you just get a number.
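For example (a minimal sketch, separate from the lexer code), printing an iota-based constant just gives you its underlying number unless you go to the trouble of writing a String method:

package main

import "fmt"

type Color int

const (
	Red Color = iota
	Green
	Blue
)

func main() {
	fmt.Println(Blue) // prints "2", not "Blue"
}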
One interesting thing about the Go implementation is that it handles Unicode without really having to worry about it.
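That comes from reading the input a rune at a time with utf8.DecodeRuneInString, which decodes the UTF-8 for you (see the read method below). A rough standalone illustration, not part of the lexer:

package main

import (
	"fmt"
	"unicode"
	"unicode/utf8"
)

func main() {
	s := "é = 1"
	c, w := utf8.DecodeRuneInString(s) // first rune and its width in bytes
	fmt.Println(string(c), w)          // "é" 2 - a multi-byte character is a single rune
	fmt.Println(unicode.IsLetter(c))   // true, so it can start an identifier
}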
I debated whether to return the result as a struct or just as multiple return values, but that really depends on which is easier for the calling code.
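Just to illustrate the difference to the caller (hypothetical code, not from the actual parser), the two options look roughly like this:

package main

import "fmt"

type item struct {
	token   int
	keyword int
	value   string
}

// struct result: the caller picks out whichever fields it cares about
func nextItem() item { return item{1, 0, "foo"} }

// multiple return values: the caller must accept (or explicitly ignore) each one
func nextValues() (token, keyword int, value string) { return 1, 0, "foo" }

func main() {
	it := nextItem()
	fmt.Println(it.token, it.value)

	tok, _, val := nextValues()
	fmt.Println(tok, val)
}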
There is a good talk by Rob Pike about Lexical Scanning in Go. If you're not interested enough to watch the video, you can skim the slides. I didn't need to use his fancier concurrent state machine design but it's an interesting example of using Go. You can see a full implementation in the Go template package.
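The heart of that fancier design is a state function that returns the next state, with the lexer running in a goroutine and emitting tokens on a channel. A stripped-down sketch of the pattern (illustrative names only, not code from this lexer):

package main

import "fmt"

type item struct{ typ, val string }

type lexer struct {
	input string
	pos   int
	items chan item
}

// stateFn represents a lexing state that returns the next state.
type stateFn func(*lexer) stateFn

// lexText is a trivial state: it emits the rest of the input and stops.
func lexText(l *lexer) stateFn {
	l.items <- item{"TEXT", l.input[l.pos:]}
	return nil
}

// run drives the state machine until a state returns nil, then closes the channel.
func (l *lexer) run() {
	for state := lexText; state != nil; state = state(l) {
	}
	close(l.items)
}

func main() {
	l := &lexer{input: "hello", items: make(chan item)}
	go l.run()
	for it := range l.items {
		fmt.Println(it.typ, it.val)
	}
}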
Here's the code: (or view it on GitHub)
// Package lex implements the lexical scanner for Suneido
package lex

import (
	"bytes"
	"strings"
	"unicode"
	"unicode/utf8"
)

type lexer struct {
	src   string
	si    int
	start int
	width int
	value string
}

type Item struct {
	token   Token
	keyword Token
	value   string
}

func Lexer(src string) *lexer {
	return &lexer{src: src}
}

func (lxr *lexer) Next() Item {
	token := lxr.next()
	var value string
	if lxr.value != "" {
		value = lxr.value
	} else {
		value = lxr.src[lxr.start:lxr.si]
	}
	keyword := NIL
	if token == IDENTIFIER && lxr.peek() != ':' {
		keyword = Keyword(value)
	}
	return Item{token, keyword, value}
}

func (lxr *lexer) next() Token {
	lxr.start = lxr.si
	lxr.value = "" // clear any string value left over from the previous token
	c := lxr.read()
	switch c {
	case eof:
		return EOF
	case '#':
		return HASH
	case '(':
		return L_PAREN
	case ')':
		return R_PAREN
	case ',':
		return COMMA
	case ';':
		return SEMICOLON
	case '?':
		return Q_MARK
	case '@':
		return AT
	case '[':
		return L_BRACKET
	case ']':
		return R_BRACKET
	case '{':
		return L_CURLY
	case '}':
		return R_CURLY
	case '~':
		return BITNOT
	case ':':
		if lxr.match(':') {
			return RANGELEN
		} else {
			return COLON
		}
	case '=':
		if lxr.match('=') {
			return IS
		} else if lxr.match('~') {
			return MATCH
		} else {
			return EQ
		}
	case '!':
		if lxr.match('=') {
			return ISNT
		} else if lxr.match('~') {
			return MATCHNOT
		} else {
			return NOT
		}
	case '<':
		if lxr.match('<') {
			if lxr.match('=') {
				return LSHIFTEQ
			} else {
				return LSHIFT
			}
		} else if lxr.match('>') {
			return ISNT
		} else if lxr.match('=') {
			return LTE
		} else {
			return LT
		}
	case '>':
		if lxr.match('>') {
			if lxr.match('=') {
				return RSHIFTEQ
			} else {
				return RSHIFT
			}
		} else if lxr.match('=') {
			return GTE
		} else {
			return GT
		}
	case '|':
		if lxr.match('|') {
			return OR
		} else if lxr.match('=') {
			return BITOREQ
		} else {
			return BITOR
		}
	case '&':
		if lxr.match('&') {
			return AND
		} else if lxr.match('=') {
			return BITANDEQ
		} else {
			return BITAND
		}
	case '^':
		if lxr.match('=') {
			return BITXOREQ
		} else {
			return BITXOR
		}
	case '-':
		if lxr.match('-') {
			return DEC
		} else if lxr.match('=') {
			return SUBEQ
		} else {
			return SUB
		}
	case '+':
		if lxr.match('+') {
			return INC
		} else if lxr.match('=') {
			return ADDEQ
		} else {
			return ADD
		}
	case '/':
		if lxr.match('/') {
			return lxr.lineComment()
		} else if lxr.match('*') {
			return lxr.spanComment()
		} else if lxr.match('=') {
			return DIVEQ
		} else {
			return DIV
		}
	case '*':
		if lxr.match('=') {
			return MULEQ
		} else {
			return MUL
		}
	case '%':
		if lxr.match('=') {
			return MODEQ
		} else {
			return MOD
		}
	case '$':
		if lxr.match('=') {
			return CATEQ
		} else {
			return CAT
		}
	case '`':
		return lxr.rawString()
case '"': | |
case '\'': | |
return lxr.quotedString(c) | |
	case '.':
		if lxr.match('.') {
			return RANGETO
		} else if unicode.IsDigit(lxr.peek()) {
			return lxr.number()
		} else {
			return DOT
		}
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
		return lxr.number()
	default:
		if isSpace(c) {
			return lxr.whitespace(c)
		} else if unicode.IsLetter(c) || c == '_' {
			return lxr.identifier()
		}
	}
	return ERROR
}

func (lxr *lexer) whitespace(c rune) Token {
	result := WHITESPACE
	for ; isSpace(c); c = lxr.read() {
		if c == '\n' || c == '\r' {
			result = NEWLINE
		}
	}
	lxr.backup()
	return result
}

func (lxr *lexer) lineComment() Token {
	for c := lxr.read(); c != eof && c != '\n'; c = lxr.read() {
	}
	return COMMENT
}

func (lxr *lexer) spanComment() Token {
	lxr.matchUntil(func() bool { return strings.HasSuffix(lxr.src[:lxr.si], "*/") })
	return COMMENT
}

func (lxr *lexer) rawString() Token {
	for c := lxr.read(); c != eof && c != '`'; c = lxr.read() {
	}
	lxr.value = lxr.src[lxr.start+1 : lxr.si-1]
	return STRING
}

func (lxr *lexer) quotedString(quote rune) Token {
	var buf bytes.Buffer
	lxr.match(quote)
	for c := lxr.read(); c != eof && c != quote; c = lxr.read() {
		buf.WriteRune(lxr.doesc(c))
	}
	lxr.value = buf.String()
	return STRING
}

func (lxr *lexer) doesc(c rune) rune {
	if c != '\\' {
		return c
	}
	save := lxr.si
	c = lxr.read()
	switch c {
	case 'n':
		return '\n'
	case 't':
		return '\t'
	case 'r':
		return '\r'
	case 'x':
		dig1 := digit(lxr.read(), 16)
		dig2 := digit(lxr.read(), 16)
		if dig1 != -1 && dig2 != -1 {
			return rune(16*dig1 + dig2)
		}
	case '\\', '"', '\'':
		return c
	default:
		dig1 := digit(c, 8)
		dig2 := digit(lxr.read(), 8)
		dig3 := digit(lxr.read(), 8)
		if dig1 != -1 && dig2 != -1 && dig3 != -1 {
			return rune(64*dig1 + 8*dig2 + dig3)
		}
	}
	lxr.si = save
	return '\\'
}

func digit(c rune, radix int) int {
	n := 99
	if isDigit(c) {
		n = int(c - '0')
	} else if isHexDigit(c) {
		n = int(10 + unicode.ToLower(c) - 'a')
	}
	if n < radix {
		return n
	} else {
		return -1
	}
}

func isDigit(r rune) bool {
	return '0' <= r && r <= '9'
}

func isHexDigit(r rune) bool {
	return strings.ContainsRune(hexDigits, r)
}

func (lxr *lexer) number() Token {
	lxr.matchOneOf("+-")
	// Is it hex?
	digits := "0123456789"
	if lxr.match('0') && lxr.matchOneOf("xX") {
		digits = hexDigits
	}
	lxr.matchRunOf(digits)
	if lxr.match('.') {
		lxr.matchRunOf(digits)
	}
	if lxr.matchOneOf("eE") {
		lxr.matchOneOf("+-")
		lxr.matchRunOf("0123456789")
	}
	return NUMBER
}

func (lxr *lexer) identifier() Token {
	lxr.matchWhile(isIdentChar)
	if !lxr.match('?') {
		lxr.match('!')
	}
	return IDENTIFIER
}

const eof = -1

func (lxr *lexer) read() rune {
	if lxr.si >= len(lxr.src) {
		lxr.width = 0
		return eof
	}
	c, w := utf8.DecodeRuneInString(lxr.src[lxr.si:])
	lxr.si += w
	lxr.width = w
	return c
}

func (lxr *lexer) backup() {
	lxr.si -= lxr.width
}

func (lxr *lexer) peek() rune {
	c := lxr.read()
	lxr.backup()
	return c
}

func (lxr *lexer) match(c rune) bool {
	if c == lxr.read() {
		return true
	}
	lxr.backup()
	return false
}

func (lxr *lexer) matchOneOf(valid string) bool {
	if strings.ContainsRune(valid, lxr.read()) {
		return true
	}
	lxr.backup()
	return false
}

func (lxr *lexer) matchRunOf(valid string) {
	for strings.ContainsRune(valid, lxr.read()) {
	}
	lxr.backup()
}

func (lxr *lexer) matchWhile(f func(c rune) bool) {
	for c := lxr.read(); f(c); c = lxr.read() {
	}
	lxr.backup()
}

func (lxr *lexer) matchUntil(f func() bool) {
	for c := lxr.read(); c != eof && !f(); c = lxr.read() {
	}
}

func isIdentChar(r rune) bool {
	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
}

const hexDigits = "0123456789abcdefABCDEF"

func isSpace(c rune) bool {
	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
}

package lex

func Keyword(s string) Token {
	return keywords[s]
}

func (t Token) IsInfix() bool {
	return infix[t]
}

func (t Token) String() string {
	return tostring[t]
}

var tostring = map[Token]string{
	NIL:        "NIL",
	EOF:        "EOF",
	WHITESPACE: "WHITESPACE",
	COMMENT:    "COMMENT",
	NEWLINE:    "NEWLINE",
}

type Token int

const (
	NIL Token = iota
	EOF
	ERROR
	IDENTIFIER
	NUMBER
	STRING
	WHITESPACE
	COMMENT
	NEWLINE
	// operators and punctuation
	HASH
	COMMA
	COLON
	SEMICOLON
	Q_MARK
	AT
	DOT
	L_PAREN
	R_PAREN
	L_BRACKET
	R_BRACKET
	L_CURLY
	R_CURLY
	IS
	ISNT
	MATCH
	MATCHNOT
	LT
	LTE
	GT
	GTE
	NOT
	INC
	DEC
	BITNOT
	ADD
	SUB
	CAT
	MUL
	DIV
	MOD
	LSHIFT
	RSHIFT
	BITOR
	BITAND
	BITXOR
	EQ
	ADDEQ
	SUBEQ
	CATEQ
	MULEQ
	DIVEQ
	MODEQ
	LSHIFTEQ
	RSHIFTEQ
	BITOREQ
	BITANDEQ
	BITXOREQ
	RANGETO
	RANGELEN
	// language keywords
	AND
	BOOL
	BREAK
	BUFFER
	CALLBACK
	CASE
	CATCH
	CHAR
	CLASS
	CONTINUE
	CREATE
	DEFAULT
	DLL
	DO
	DOUBLE
	ELSE
	FALSE
	FLOAT
	FOR
	FOREVER
	FUNCTION
	GDIOBJ
	HANDLE
	IF
	IN
	INT64
	LONG
	NEW
	OR
	RESOURCE
	RETURN
	SHORT
	STRUCT
	SWITCH
	SUPER
	THIS
	THROW
	TRUE
	TRY
	VOID
	WHILE
	// query keywords
	ALTER
	AVERAGE
	CASCADE
	COUNT
	DELETE
	DROP
	ENSURE
	EXTEND
	HISTORY
	INDEX
	INSERT
	INTERSECT
	INTO
	JOIN
	KEY
	LEFTJOIN
	LIST
	MAX
	MIN
	MINUS
	PROJECT
	REMOVE
	RENAME
	REVERSE
	SET
	SORT
	SUMMARIZE
	SVIEW
	TIMES
	TO
	TOTAL
	UNION
	UNIQUE
	UPDATE
	UPDATES
	VIEW
	WHERE
	// for AST
	ARG
	ASSIGNOP
	BINARYOP
	BLOCK
	CALL
	DATE
	FOR_IN
	MEMBER
	METHOD
	OBJECT
	POSTINCDEC
	PREINCDEC
	RECORD
	RVALUE
	SELFREF
	SUBSCRIPT
	SYMBOL
)

var keywords = map[string]Token{
	"and": AND,
	"bool": BOOL,
	"break": BREAK,
	"buffer": BUFFER,
	"callback": CALLBACK,
	"case": CASE,
	"catch": CATCH,
	"char": CHAR,
	"class": CLASS,
	"continue": CONTINUE,
	"default": DEFAULT,
	"dll": DLL,
	"do": DO,
	"double": DOUBLE,
	"else": ELSE,
	"false": FALSE,
	"float": FLOAT,
	"for": FOR,
	"forever": FOREVER,
	"function": FUNCTION,
	"gdiobj": GDIOBJ,
	"handle": HANDLE,
	"if": IF,
	"in": IN,
	"int64": INT64,
	"is": IS,
	"isnt": ISNT,
	"long": LONG,
	"new": NEW,
	"not": NOT,
	"or": OR,
	"resource": RESOURCE,
	"return": RETURN,
	"short": SHORT,
	"string": STRING,
	"struct": STRUCT,
	"super": SUPER,
	"switch": SWITCH,
	"this": THIS,
	"throw": THROW,
	"true": TRUE,
	"try": TRY,
	"void": VOID,
	"while": WHILE,
	"xor": ISNT,
}

var infix = map[Token]bool{
	AND: true,
	OR: true,
	Q_MARK: true,
	MATCH: true,
	MATCHNOT: true,
	ADD: true,
	SUB: true,
	CAT: true,
	MUL: true,
	DIV: true,
	MOD: true,
	LSHIFT: true,
	RSHIFT: true,
	BITOR: true,
	BITAND: true,
	BITXOR: true,
}
3 comments:
The statement "Go only supports simple numeric constants to implement enums. That works ok, but it's awkward for debugging because if you print them, you just get a number." is not completely true.
If you have a "typed" constant you can add a String() method. For example you should be able to do:
func (t Token) String() string {
	switch t {
	case VOID: return "void"
	case SOMETHING: return "something"
	...
	}
}
Not the best, since it requires extra work, but it's possible.
Yeah, you'll see in the code I do that, although only for some of the values. But it's verbose and a lot of duplication. (!DRY) If you add a token you have to remember to add it to the String method as well. It's not a huge problem, just a nuisance. I'm not sure how you'd fix it without adding undesired complexity to Go.
Something new with Go is go generate, which may be helpful:
http://blog.golang.org/generate
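In particular, the stringer tool (golang.org/x/tools/cmd/stringer) can generate the String() method from the constant names, so the token list stays the single source of truth. Roughly, assuming stringer is installed, the directive goes next to the type declaration:

package lex

//go:generate stringer -type=Token
type Token int

Running go generate in the package directory would then write a token_string.go file containing a complete String() method for Token.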