Lexmachine
A lexical tokenizer framework, adapted from an open-source project. The main change is the computation of line and column numbers for UTF-8 text.

Quick Start

Installation
go get gitea.xintech.co/zhouzhihong/lexmachine
Usage Example
package main

import (
	"fmt"
	"strings"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

var Literals []string       // The tokens representing literal strings
var Keywords []string       // The keyword tokens
var Tokens []string         // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner

// Called at package initialization. Creates the lexer and populates the token lists.
func init() {
	initTokens()
	var err error
	Lexer, err = initLexer()
	if err != nil {
		panic(err)
	}
}

func initTokens() {
	Literals = []string{
		"[",
		"]",
		"{",
		"}",
		"=",
		",",
		";",
		":",
		"->",
		"--",
	}
	Keywords = []string{
		"NODE",
		"EDGE",
		"GRAPH",
		"DIGRAPH",
		"SUBGRAPH",
		"STRICT",
	}
	Tokens = []string{
		"COMMENT",
		"ID",
	}
	Tokens = append(Tokens, Keywords...)
	Tokens = append(Tokens, Literals...)
	TokenIds = make(map[string]int)
	for i, tok := range Tokens {
		TokenIds[tok] = i
	}
}

// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
	lexer := lex.NewLexer()
	for _, lit := range Literals {
		// Escape every character of the literal so it is matched verbatim.
		r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
		lexer.Add([]byte(r), token(lit))
	}
	for _, name := range Keywords {
		lexer.Add([]byte(strings.ToLower(name)), token(name))
	}
	lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
	lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
	lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			x, _ := token("ID")(scan, match)
			t := x.(*lex.Token)
			v := t.Value.(string)
			t.Value = v[1 : len(v)-1] // strip the surrounding quotes
			return t, nil
		})
	lexer.Add([]byte("( |\t|\n|\r)+"), skip)
	lexer.Add([]byte(`\<`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			str := make([]byte, 0, 10)
			str = append(str, match.Bytes...)
			brackets := 1
			match.EndLine = match.StartLine
			match.EndColumn = match.StartColumn
			for tc := scan.TC; tc < len(scan.Text); tc++ {
				str = append(str, scan.Text[tc])
				match.EndColumn += 1
				if scan.Text[tc] == '\n' {
					match.EndLine += 1
				}
				if scan.Text[tc] == '<' {
					brackets += 1
				} else if scan.Text[tc] == '>' {
					brackets -= 1
				}
				if brackets == 0 {
					match.TC = scan.TC
					scan.TC = tc + 1
					match.Bytes = str
					x, _ := token("ID")(scan, match)
					t := x.(*lex.Token)
					v := t.Value.(string)
					t.Value = v[1 : len(v)-1] // strip the enclosing < >
					return t, nil
				}
			}
			return nil,
				fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
					match.TC, match.StartLine, match.StartColumn)
		},
	)
	err := lexer.Compile()
	if err != nil {
		return nil, err
	}
	return lexer, nil
}

// a lex.Action function which skips the match.
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
	return nil, nil
}

// a lex.Action function which constructs a Token of the given token type by
// the token type's name.
func token(name string) lex.Action {
	return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
		return s.Token(TokenIds[name], string(m.Bytes), m), nil
	}
}

// Construct a Scanner from the compiled Lexer and iterate over the tokens.
func main() {
	s, err := Lexer.Scanner([]byte(`
digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
strict // asdfa asfwe
/*你好*/ DIGRAPH // asdf`))
	if err != nil {
		panic(err)
	}
	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		if err != nil {
			fmt.Println(err)
			break
		}
		t := tok.(*lex.Token)
		fmt.Println(t)
		fmt.Printf("(%v,%v)-(%v,%v)\n",
			t.TSLine, t.TSColumn,
			t.TELine, t.TEColumn,
		)
	}
}
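
The position fields above (TSLine, TSColumn, TELine, TEColumn) are where this fork's UTF-8 handling shows up. The standalone sketch below illustrates reading those fields for input that contains multi-byte characters. It reuses the ID, COMMENT, and whitespace patterns from the example above; the token ids (0 and 1), the sample input, and the expectation that columns are counted in characters rather than bytes are illustrative assumptions, so treat the output as an illustration rather than reference behaviour.

package main

import (
	"fmt"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

func main() {
	lexer := lex.NewLexer()
	// Token id 0: identifiers (same pattern as the ID rule above).
	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`),
		func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
			return s.Token(0, string(m.Bytes), m), nil
		})
	// Token id 1: block comments, which may contain multi-byte UTF-8 text
	// (same pattern as the COMMENT rule above).
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`),
		func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
			return s.Token(1, string(m.Bytes), m), nil
		})
	// Whitespace is skipped by returning a nil token.
	lexer.Add([]byte("( |\t|\n|\r)+"),
		func(*lex.Scanner, *machines.Match) (interface{}, error) {
			return nil, nil
		})
	if err := lexer.Compile(); err != nil {
		panic(err)
	}

	s, err := lexer.Scanner([]byte("abc /*你好, 世界*/ def\nghi"))
	if err != nil {
		panic(err)
	}
	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		if err != nil {
			panic(err)
		}
		t := tok.(*lex.Token)
		fmt.Printf("%q (%v,%v)-(%v,%v)\n", t.Value, t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
	}
}

Each output line shows the token value followed by its (start line, start column)-(end line, end column) span; the columns of the tokens following the multi-byte comment are where the UTF-8-aware counting becomes visible.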
// TODO: optimize the position-marker (line/column) computation