## Lexmachine

A lexical analyzer adapted from the open-source lexmachine project. The main change is the addition of line and column tracking for UTF-8 text.
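
Byte offsets and character columns diverge as soon as the input contains multi-byte UTF-8 sequences, which is why the fork has to count runes rather than bytes when computing positions. A minimal, standard-library-only illustration (not part of this package):

```
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	s := "你好, world"                     // the two CJK runes occupy 3 bytes each in UTF-8
	fmt.Println(len(s))                    // 13: length in bytes
	fmt.Println(utf8.RuneCountInString(s)) // 9: length in runes, the unit a column counter should use
}
```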

### Quick Start

#### Installation

```
go get gitea.xintech.co/zhouzhihong/lexmachine
```

#### Usage Example

```
package main

import (
	"fmt"
	"strings"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

var Literals []string       // The tokens representing literal strings
var Keywords []string       // The keyword tokens
var Tokens []string         // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner

// Called at package initialization. Creates the lexer and populates token lists.
func init() {
	initTokens()
	var err error
	Lexer, err = initLexer()
	if err != nil {
		panic(err)
	}
}

func initTokens() {
	Literals = []string{
		"[",
		"]",
		"{",
		"}",
		"=",
		",",
		";",
		":",
		"->",
		"--",
	}
	Keywords = []string{
		"NODE",
		"EDGE",
		"GRAPH",
		"DIGRAPH",
		"SUBGRAPH",
		"STRICT",
	}
	Tokens = []string{
		"COMMENT",
		"ID",
	}
	Tokens = append(Tokens, Keywords...)
	Tokens = append(Tokens, Literals...)
	TokenIds = make(map[string]int)
	for i, tok := range Tokens {
		TokenIds[tok] = i
	}
}

// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
	lexer := lex.NewLexer()

	for _, lit := range Literals {
		// Escape every character of the literal so it matches verbatim.
		r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
		lexer.Add([]byte(r), token(lit))
	}
	for _, name := range Keywords {
		lexer.Add([]byte(strings.ToLower(name)), token(name))
	}

	lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
	lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
	lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			x, _ := token("ID")(scan, match)
			t := x.(*lex.Token)
			v := t.Value.(string)
			t.Value = v[1 : len(v)-1] // strip the surrounding quotes
			return t, nil
		})
	lexer.Add([]byte("( |\t|\n|\r)+"), skip)
	lexer.Add([]byte(`\<`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			// Scan forward manually to match a balanced <...> HTML literal.
			str := make([]byte, 0, 10)
			str = append(str, match.Bytes...)
			brackets := 1
			match.EndLine = match.StartLine
			match.EndColumn = match.StartColumn
			for tc := scan.TC; tc < len(scan.Text); tc++ {
				str = append(str, scan.Text[tc])
				match.EndColumn += 1
				if scan.Text[tc] == '\n' {
					match.EndLine += 1
				}
				if scan.Text[tc] == '<' {
					brackets += 1
				} else if scan.Text[tc] == '>' {
					brackets -= 1
				}
				if brackets == 0 {
					match.TC = scan.TC
					scan.TC = tc + 1
					match.Bytes = str
					x, _ := token("ID")(scan, match)
					t := x.(*lex.Token)
					v := t.Value.(string)
					t.Value = v[1 : len(v)-1] // strip the outer angle brackets
					return t, nil
				}
			}
			return nil,
				fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
					match.TC, match.StartLine, match.StartColumn)
		},
	)

	err := lexer.Compile()
	if err != nil {
		return nil, err
	}
	return lexer, nil
}

// a lex.Action function which skips the match.
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
	return nil, nil
}

// a lex.Action function which constructs a Token of the given token type by
// the token type's name.
func token(name string) lex.Action {
	return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
		return s.Token(TokenIds[name], string(m.Bytes), m), nil
	}
}

func main() {
	s, _ := Lexer.Scanner([]byte(`
	    digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
	    strict // asdfa asfwe
	    /*你好*/ DIGRAPH // asdf`))

	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		fmt.Println(tok, err)
		if err != nil {
			break // tok is nil when err is non-nil; stop instead of dereferencing it
		}
		fmt.Printf("(%v,%v)-(%v,%v)\n",
			tok.(*lex.Token).TSLine,
			tok.(*lex.Token).TSColumn,
			tok.(*lex.Token).TELine,
			tok.(*lex.Token).TEColumn,
		)
	}
}
```
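
The example input above lexes cleanly, so `err` stays nil. If the input may contain text the lexer does not recognize, upstream lexmachine reports it as a `*machines.UnconsumedInput` error; assuming this fork keeps that behavior and the exported `Scanner.TC` field, the loop can skip past the failure point instead of stopping (a sketch, not part of the package):

```
for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
	if ui, is := err.(*machines.UnconsumedInput); is {
		s.TC = ui.FailTC // jump the scanner past the unrecognized input
		continue
	} else if err != nil {
		panic(err)
	}
	fmt.Println(tok)
}
```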

TODO: optimize the line/column position calculation.