Compare commits

...

4 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| | 62b949dd83 | Merge remote-tracking branch 'origin/dev' | 2022-08-28 22:51:25 +08:00 |
| zhouzhihong | 95e26e7aae | Merge branch 'dev' of ssh://gitea.xintech.co:2222/zhouzhihong/lexmachine into dev | 2022-08-26 14:49:21 +08:00 |
| zhouzhihong | 7d2133ac25 | Update doc | 2022-08-26 14:48:17 +08:00 |
| | d2362be35b | Update token string func. | 2022-08-25 21:49:42 +08:00 |
3 changed files with 180 additions and 116 deletions

README.md (new file, 166 additions)

@@ -0,0 +1,166 @@
## Lexmachine
This lexer is adapted from an open-source project; the main change is adding line and column calculation for UTF-8 text.
### Quick start
#### Install
```
go get gitea.xintech.co/zhouzhihong/lexmachine
```
#### Usage example
```
package main

import (
"fmt"
"strings"
lex "gitea.xintech.co/zhouzhihong/lexmachine"
"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)
var Literals []string // The tokens representing literal strings
var Keywords []string // The keyword tokens
var Tokens []string // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer // The lexer object. Use this to construct a Scanner
// Called at package initialization. Creates the lexer and populates token lists.
func init() {
initTokens()
var err error
Lexer, err = initLexer()
if err != nil {
panic(err)
}
}
func initTokens() {
Literals = []string{
"[",
"]",
"{",
"}",
"=",
",",
";",
":",
"->",
"--",
}
Keywords = []string{
"NODE",
"EDGE",
"GRAPH",
"DIGRAPH",
"SUBGRAPH",
"STRICT",
}
Tokens = []string{
"COMMENT",
"ID",
}
Tokens = append(Tokens, Keywords...)
Tokens = append(Tokens, Literals...)
TokenIds = make(map[string]int)
for i, tok := range Tokens {
TokenIds[tok] = i
}
}
// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
lexer := lex.NewLexer()
for _, lit := range Literals {
r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
lexer.Add([]byte(r), token(lit))
}
for _, name := range Keywords {
lexer.Add([]byte(strings.ToLower(name)), token(name))
}
lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
x, _ := token("ID")(scan, match)
t := x.(*lex.Token)
v := t.Value.(string)
t.Value = v[1 : len(v)-1]
return t, nil
})
lexer.Add([]byte("( |\t|\n|\r)+"), skip)
lexer.Add([]byte(`\<`),
func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
str := make([]byte, 0, 10)
str = append(str, match.Bytes...)
brackets := 1
match.EndLine = match.StartLine
match.EndColumn = match.StartColumn
for tc := scan.TC; tc < len(scan.Text); tc++ {
str = append(str, scan.Text[tc])
match.EndColumn += 1
if scan.Text[tc] == '\n' {
match.EndLine += 1
}
if scan.Text[tc] == '<' {
brackets += 1
} else if scan.Text[tc] == '>' {
brackets -= 1
}
if brackets == 0 {
match.TC = scan.TC
scan.TC = tc + 1
match.Bytes = str
x, _ := token("ID")(scan, match)
t := x.(*lex.Token)
v := t.Value.(string)
t.Value = v[1 : len(v)-1]
return t, nil
}
}
return nil,
fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
match.TC, match.StartLine, match.StartColumn)
},
)
err := lexer.Compile()
if err != nil {
return nil, err
}
return lexer, nil
}
// a lex.Action function which skips the match.
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
return nil, nil
}
// a lex.Action function which constructs a Token of the given token type by
// the token type's name.
func token(name string) lex.Action {
return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
return s.Token(TokenIds[name], string(m.Bytes), m), nil
}
}
// main scans a small dot-language sample and prints each token together with
// the TSLine/TSColumn/TELine/TEColumn position fields.
func main() {
	s, err := Lexer.Scanner([]byte(`
digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
strict // asdfa asfwe
/*你好*/ DIGRAPH // asdf`))
	if err != nil {
		panic(err)
	}
	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		if err != nil {
			fmt.Println(err)
			continue
		}
		t := tok.(*lex.Token)
		fmt.Println(t)
		fmt.Printf("(%v,%v)-(%v,%v)\n", t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
	}
}
```
// TODO: optimize the position marker calculation
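As context for the TODO above: the stated purpose of this fork is rune-aware line and column tracking for UTF-8 input, where a multi-byte character advances the column by one rather than by its byte length. The sketch below only illustrates that idea; the helper `position` and its full rescan of the text are hypothetical and are not this repository's implementation, which presumably tracks positions incrementally inside the scanner (the part the TODO refers to).
```
package main

import (
	"fmt"
	"unicode/utf8"
)

// position returns the 1-based line and the number of runes since the last
// newline for byte offset tc in text. Decoding runes (not bytes) is what makes
// the column correct for multi-byte UTF-8 characters.
func position(text []byte, tc int) (line, col int) {
	line = 1
	for i := 0; i < tc && i < len(text); {
		r, size := utf8.DecodeRune(text[i:])
		if r == '\n' {
			line++
			col = 0
		} else {
			col++
		}
		i += size
	}
	return line, col
}

func main() {
	text := []byte("/*你好*/ DIGRAPH")
	fmt.Println(position(text, len("/*你好*/"))) // prints "1 6": six runes, ten bytes
}
```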

doc.go (115 deletions)

@@ -1,116 +1 @@
// Package lexmachine is a full lexical analysis framework for the Go
// programming language. It supports a restricted but usable set of regular
// expressions appropriate for writing lexers for complex programming
// languages. The framework also supports sub-lexers and non-regular lexing
// through an "escape hatch" which allows the users to consume any number of
// further bytes after a match. So if you want to support nested C-style
// comments or other paired structures you can do so at the lexical analysis
// stage.
//
// For a tutorial see
// http://hackthology.com/writing-a-lexer-in-go-with-lexmachine.html
//
// Example of defining a lexer
//
// // CreateLexer defines a lexer for the graphviz dot language.
// func CreateLexer() (*lexmachine.Lexer, error) {
// lexer := lexmachine.NewLexer()
//
// for _, lit := range Literals {
// r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
// lexer.Add([]byte(r), token(lit))
// }
// for _, name := range Keywords {
// lexer.Add([]byte(strings.ToLower(name)), token(name))
// }
//
// lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
// lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
// lexer.Add([]byte(`([a-z]|[A-Z])([a-z]|[A-Z]|[0-9]|_)*`), token("ID"))
// lexer.Add([]byte(`"([^\\"]|(\\.))*"`), token("ID"))
// lexer.Add([]byte("( |\t|\n|\r)+"), skip)
// lexer.Add([]byte(`\<`),
// func(scan *lexmachine.Scanner, match *machines.Match) (interface{}, error) {
// str := make([]byte, 0, 10)
// str = append(str, match.Bytes...)
// brackets := 1
// match.EndLine = match.StartLine
// match.EndColumn = match.StartColumn
// for tc := scan.TC; tc < len(scan.Text); tc++ {
// str = append(str, scan.Text[tc])
// match.EndColumn += 1
// if scan.Text[tc] == '\n' {
// match.EndLine += 1
// }
// if scan.Text[tc] == '<' {
// brackets += 1
// } else if scan.Text[tc] == '>' {
// brackets -= 1
// }
// if brackets == 0 {
// match.TC = scan.TC
// scan.TC = tc + 1
// match.Bytes = str
// return token("ID")(scan, match)
// }
// }
// return nil,
// fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
// match.TC, match.StartLine, match.StartColumn)
// },
// )
//
// err := lexer.Compile()
// if err != nil {
// return nil, err
// }
// return lexer, nil
// }
//
// func token(name string) lex.Action {
// return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
// return s.Token(TokenIds[name], string(m.Bytes), m), nil
// }
// }
//
// Example of using a lexer
//
// func ExampleLex() error {
// lexer, err := CreateLexer()
// if err != nil {
// return err
// }
// scanner, err := lexer.Scanner([]byte(`digraph {
// rankdir=LR;
// a [label="a" shape=box];
// c [<label>=<<u>C</u>>];
// b [label="bb"];
// a -> c;
// c -> b;
// d -> c;
// b -> a;
// b -> e;
// e -> f;
// }`))
// if err != nil {
// return err
// }
// fmt.Println("Type | Lexeme | Position")
// fmt.Println("--------+------------+------------")
// for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() {
// if err != nil {
// return err
// }
// token := tok.(*lexmachine.Token)
// fmt.Printf("%-7v | %-10v | %v:%v-%v:%v\n",
// dot.Tokens[token.Type],
// string(token.Lexeme),
// token.StartLine,
// token.StartColumn,
// token.EndLine,
// token.EndColumn)
// }
// return nil
// }
//
package lexmachine


@@ -53,7 +53,20 @@ func (t *Token) Equals(other *Token) bool {
// String formats the token in a human readable form.
func (t *Token) String() string {
-	return fmt.Sprintf("%d %q %d (%d, %d)-(%d, %d)", t.Type, t.Value, t.TC, t.StartLine, t.StartColumn, t.EndLine, t.EndColumn)
+	return fmt.Sprintf(
+		"%d %q %d (%d, %d)-(%d, %d) (%d, %d)-(%d, %d)",
+		t.Type,
+		t.Value,
+		t.TC,
+		t.StartLine,
+		t.StartColumn,
+		t.EndLine,
+		t.EndColumn,
+		t.TSLine,
+		t.TSColumn,
+		t.TELine,
+		t.TEColumn,
+	)
}
// An Action is a function which get called when the Scanner finds a match // An Action is a function which get called when the Scanner finds a match