## Lexmachine
The lexer is adapted from an open-source project; the main change is adding line and column calculation for UTF-8 text.
### Quick Start
#### Installation
```
go get gitea.xintech.co/zhouzhihong/lexmachine
```
#### Usage Example
```
package main

import (
	"fmt"
	"strings"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

var Literals []string       // The tokens representing literal strings
var Keywords []string       // The keyword tokens
var Tokens []string         // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner

// Called at package initialization. Creates the lexer and populates token lists.
func init() {
	initTokens()
	var err error
	Lexer, err = initLexer()
	if err != nil {
		panic(err)
	}
}

func initTokens() {
	Literals = []string{
		"[",
		"]",
		"{",
		"}",
		"=",
		",",
		";",
		":",
		"->",
		"--",
	}
	Keywords = []string{
		"NODE",
		"EDGE",
		"GRAPH",
		"DIGRAPH",
		"SUBGRAPH",
		"STRICT",
	}
	Tokens = []string{
		"COMMENT",
		"ID",
	}
	Tokens = append(Tokens, Keywords...)
	Tokens = append(Tokens, Literals...)
	TokenIds = make(map[string]int)
	for i, tok := range Tokens {
		TokenIds[tok] = i
	}
}

// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
	lexer := lex.NewLexer()
	// Literals are matched exactly; every character is escaped so regex
	// metacharacters such as "[" are treated literally.
	for _, lit := range Literals {
		r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
		lexer.Add([]byte(r), token(lit))
	}
	// Keywords are matched in their lowercase form.
	for _, name := range Keywords {
		lexer.Add([]byte(strings.ToLower(name)), token(name))
	}
	lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
	lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
	// Quoted strings become IDs with the surrounding quotes stripped.
	lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			x, _ := token("ID")(scan, match)
			t := x.(*lex.Token)
			v := t.Value.(string)
			t.Value = v[1 : len(v)-1]
			return t, nil
		})
	lexer.Add([]byte("( |\t|\n|\r)+"), skip)
	// HTML literals (<...>) nest, so a regular expression alone cannot
	// match them; this action scans forward manually, tracking bracket
	// depth and updating the match's end line/column as it goes.
	lexer.Add([]byte(`\<`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			str := make([]byte, 0, 10)
			str = append(str, match.Bytes...)
			brackets := 1
			match.EndLine = match.StartLine
			match.EndColumn = match.StartColumn
			for tc := scan.TC; tc < len(scan.Text); tc++ {
				str = append(str, scan.Text[tc])
				match.EndColumn += 1
				if scan.Text[tc] == '\n' {
					match.EndLine += 1
				}
				if scan.Text[tc] == '<' {
					brackets += 1
				} else if scan.Text[tc] == '>' {
					brackets -= 1
				}
				if brackets == 0 {
					match.TC = scan.TC
					scan.TC = tc + 1
					match.Bytes = str
					x, _ := token("ID")(scan, match)
					t := x.(*lex.Token)
					v := t.Value.(string)
					t.Value = v[1 : len(v)-1]
					return t, nil
				}
			}
			return nil,
				fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
					match.TC, match.StartLine, match.StartColumn)
		},
	)
	err := lexer.Compile()
	if err != nil {
		return nil, err
	}
	return lexer, nil
}

// skip is a lex.Action which drops the match (no token is emitted).
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
	return nil, nil
}

// token returns a lex.Action which constructs a Token of the given
// token type by the token type's name.
func token(name string) lex.Action {
	return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
		return s.Token(TokenIds[name], string(m.Bytes), m), nil
	}
}

func main() {
	s, err := Lexer.Scanner([]byte(`
digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
strict // asdfa asfwe
/*你好*/ DIGRAPH // asdf`))
	if err != nil {
		panic(err)
	}
	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		if err != nil {
			panic(err)
		}
		t := tok.(*lex.Token)
		fmt.Println(t)
		fmt.Printf("(%v,%v)-(%v,%v)\n",
			t.TSLine,
			t.TSColumn,
			t.TELine,
			t.TEColumn,
		)
	}
}
```
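Since the point of this fork is rune-based position counting, the sketch below isolates that behavior. It is a minimal example reusing the block-comment rule and the `Token` position fields (`TSLine`/`TSColumn`/`TELine`/`TEColumn`) from the example above; the token type id `0` and the input string are arbitrary choices for illustration. It scans comments containing multi-byte characters and prints each one's start and end position.
```
package main

import (
	"fmt"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

func main() {
	lexer := lex.NewLexer()
	// Block comments, same pattern as the DOT example above.
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`),
		func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
			return s.Token(0, string(m.Bytes), m), nil
		})
	// Skip whitespace by returning a nil token.
	lexer.Add([]byte("( |\t|\n|\r)+"),
		func(*lex.Scanner, *machines.Match) (interface{}, error) {
			return nil, nil
		})
	if err := lexer.Compile(); err != nil {
		panic(err)
	}
	s, err := lexer.Scanner([]byte("/*你好*/ /*ok*/"))
	if err != nil {
		panic(err)
	}
	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		if err != nil {
			panic(err)
		}
		t := tok.(*lex.Token)
		// With rune-based counting, /*你好*/ should span 8 columns,
		// even though it is 12 bytes long.
		fmt.Printf("%s (%d,%d)-(%d,%d)\n",
			t.Value, t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
	}
}
```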
//TODO: line/column calculation for error messages
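One possible approach for this TODO is sketched below. It is not part of this package: `position` is a hypothetical helper that walks the scanned bytes up to an error's byte offset, counting columns in runes rather than bytes so multi-byte UTF-8 characters advance the column by exactly one.
```
package main

import (
	"fmt"
	"unicode/utf8"
)

// position converts a byte offset tc within text into a 1-based line
// and a 0-based column, counting columns in runes rather than bytes.
func position(text []byte, tc int) (line, col int) {
	line, col = 1, 0
	for i := 0; i < tc && i < len(text); {
		r, size := utf8.DecodeRune(text[i:])
		if r == '\n' {
			line++
			col = 0
		} else {
			col++
		}
		i += size
	}
	return line, col
}

func main() {
	text := []byte("digraph G {\n/*你好*/ DIGRAPH\n}")
	// The final '}' sits at line 3, column 1; /*你好*/ counts as
	// 8 runes on line 2 rather than 12 bytes.
	line, col := position(text, len(text))
	fmt.Println(line, col) // 3 1
}
```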