Compare commits
No commits in common. "main" and "v0.9.2" have entirely different histories.
README.md
@@ -1,166 +0,0 @@
## Lexmachine

A lexical analyzer, adapted from an open-source project; the main change is adding line and column computation for UTF-8 text.

### Quick Start

#### Install

```
go get gitea.xintech.co/zhouzhihong/lexmachine
```

#### Usage example

```
import (
    "fmt"
    "strings"

    lex "gitea.xintech.co/zhouzhihong/lexmachine"
    "gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

var Literals []string       // The tokens representing literal strings
var Keywords []string       // The keyword tokens
var Tokens []string         // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner

// Called at package initialization. Creates the lexer and populates token lists.
func init() {
    initTokens()
    var err error
    Lexer, err = initLexer()
    if err != nil {
        panic(err)
    }
}

func initTokens() {
    Literals = []string{
        "[",
        "]",
        "{",
        "}",
        "=",
        ",",
        ";",
        ":",
        "->",
        "--",
    }
    Keywords = []string{
        "NODE",
        "EDGE",
        "GRAPH",
        "DIGRAPH",
        "SUBGRAPH",
        "STRICT",
    }
    Tokens = []string{
        "COMMENT",
        "ID",
    }
    Tokens = append(Tokens, Keywords...)
    Tokens = append(Tokens, Literals...)
    TokenIds = make(map[string]int)
    for i, tok := range Tokens {
        TokenIds[tok] = i
    }
}

// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
    lexer := lex.NewLexer()

    for _, lit := range Literals {
        r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
        lexer.Add([]byte(r), token(lit))
    }
    for _, name := range Keywords {
        lexer.Add([]byte(strings.ToLower(name)), token(name))
    }

    lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
    lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
    lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
    lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
    lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
        func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
            x, _ := token("ID")(scan, match)
            t := x.(*lex.Token)
            v := t.Value.(string)
            t.Value = v[1 : len(v)-1]
            return t, nil
        })
    lexer.Add([]byte("( |\t|\n|\r)+"), skip)
    lexer.Add([]byte(`\<`),
        func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
            str := make([]byte, 0, 10)
            str = append(str, match.Bytes...)
            brackets := 1
            match.EndLine = match.StartLine
            match.EndColumn = match.StartColumn
            for tc := scan.TC; tc < len(scan.Text); tc++ {
                str = append(str, scan.Text[tc])
                match.EndColumn += 1
                if scan.Text[tc] == '\n' {
                    match.EndLine += 1
                }
                if scan.Text[tc] == '<' {
                    brackets += 1
                } else if scan.Text[tc] == '>' {
                    brackets -= 1
                }
                if brackets == 0 {
                    match.TC = scan.TC
                    scan.TC = tc + 1
                    match.Bytes = str
                    x, _ := token("ID")(scan, match)
                    t := x.(*lex.Token)
                    v := t.Value.(string)
                    t.Value = v[1 : len(v)-1]
                    return t, nil
                }
            }
            return nil,
                fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
                    match.TC, match.StartLine, match.StartColumn)
        },
    )

    err := lexer.Compile()
    if err != nil {
        return nil, err
    }
    return lexer, nil
}

// a lex.Action function which skips the match.
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
    return nil, nil
}

// a lex.Action function which constructs a Token of the given token type by
// the token type's name.
func token(name string) lex.Action {
    return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
        return s.Token(TokenIds[name], string(m.Bytes), m), nil
    }
}

s, _ := Lexer.Scanner([]byte(`
digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
strict // asdfa asfwe
/*你好*/ DIGRAPH // asdf`))

for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
    fmt.Println(tok, err)
    fmt.Printf("(%v,%v)-(%v,%v)\n",
        tok.(*lex.Token).TSLine,
        tok.(*lex.Token).TSColumn,
        tok.(*lex.Token).TELine,
        tok.(*lex.Token).TEColumn,
    )
}
```

// TODO: line and column computation for error messages
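The TODO above is partly addressed elsewhere in this comparison: the `machines` error types gain positional fields (see the `UnconsumedInput` and `EmptyMatchError` hunks further down). A minimal sketch of how a caller could surface those positions when a scan fails, assuming the field names shown in this diff:

```
// Hedged sketch: report scan failures with line/column positions.
// Assumes the UnconsumedInput fields (StartLine, StartColumn) added
// in the machines package hunks below.
for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
    if ui, ok := err.(*machines.UnconsumedInput); ok {
        fmt.Printf("lex error at %d:%d\n", ui.StartLine, ui.StartColumn)
        break
    }
    if err != nil {
        fmt.Println(err)
        break
    }
    fmt.Println(tok)
}
```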
doc.go
@@ -1 +1,116 @@
// Package lexmachine is a full lexical analysis framework for the Go
// programming language. It supports a restricted but usable set of regular
// expressions appropriate for writing lexers for complex programming
// languages. The framework also supports sub-lexers and non-regular lexing
// through an "escape hatch" which allows the users to consume any number of
// further bytes after a match. So if you want to support nested C-style
// comments or other paired structures you can do so at the lexical analysis
// stage.
//
// For a tutorial see
// http://hackthology.com/writing-a-lexer-in-go-with-lexmachine.html
//
// Example of defining a lexer
//
//     // CreateLexer defines a lexer for the graphviz dot language.
//     func CreateLexer() (*lexmachine.Lexer, error) {
//         lexer := lexmachine.NewLexer()
//
//         for _, lit := range Literals {
//             r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
//             lexer.Add([]byte(r), token(lit))
//         }
//         for _, name := range Keywords {
//             lexer.Add([]byte(strings.ToLower(name)), token(name))
//         }
//
//         lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
//         lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
//         lexer.Add([]byte(`([a-z]|[A-Z])([a-z]|[A-Z]|[0-9]|_)*`), token("ID"))
//         lexer.Add([]byte(`"([^\\"]|(\\.))*"`), token("ID"))
//         lexer.Add([]byte("( |\t|\n|\r)+"), skip)
//         lexer.Add([]byte(`\<`),
//             func(scan *lexmachine.Scanner, match *machines.Match) (interface{}, error) {
//                 str := make([]byte, 0, 10)
//                 str = append(str, match.Bytes...)
//                 brackets := 1
//                 match.EndLine = match.StartLine
//                 match.EndColumn = match.StartColumn
//                 for tc := scan.TC; tc < len(scan.Text); tc++ {
//                     str = append(str, scan.Text[tc])
//                     match.EndColumn += 1
//                     if scan.Text[tc] == '\n' {
//                         match.EndLine += 1
//                     }
//                     if scan.Text[tc] == '<' {
//                         brackets += 1
//                     } else if scan.Text[tc] == '>' {
//                         brackets -= 1
//                     }
//                     if brackets == 0 {
//                         match.TC = scan.TC
//                         scan.TC = tc + 1
//                         match.Bytes = str
//                         return token("ID")(scan, match)
//                     }
//                 }
//                 return nil,
//                     fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
//                         match.TC, match.StartLine, match.StartColumn)
//             },
//         )
//
//         err := lexer.Compile()
//         if err != nil {
//             return nil, err
//         }
//         return lexer, nil
//     }
//
//     func token(name string) lex.Action {
//         return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
//             return s.Token(TokenIds[name], string(m.Bytes), m), nil
//         }
//     }
//
// Example of using a lexer
//
//     func ExampleLex() error {
//         lexer, err := CreateLexer()
//         if err != nil {
//             return err
//         }
//         scanner, err := lexer.Scanner([]byte(`digraph {
//             rankdir=LR;
//             a [label="a" shape=box];
//             c [<label>=<<u>C</u>>];
//             b [label="bb"];
//             a -> c;
//             c -> b;
//             d -> c;
//             b -> a;
//             b -> e;
//             e -> f;
//         }`))
//         if err != nil {
//             return err
//         }
//         fmt.Println("Type    | Lexeme     | Position")
//         fmt.Println("--------+------------+------------")
//         for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() {
//             if err != nil {
//                 return err
//             }
//             token := tok.(*lexmachine.Token)
//             fmt.Printf("%-7v | %-10v | %v:%v-%v:%v\n",
//                 dot.Tokens[token.Type],
//                 string(token.Lexeme),
//                 token.StartLine,
//                 token.StartColumn,
//                 token.EndLine,
//                 token.EndColumn)
//         }
//         return nil
//     }
//
package lexmachine
@@ -23,7 +23,10 @@ func TestParse(x *testing.T) {
}

func tMatch(program inst.Slice, text string, t *test.T) {
    expected := []machines.Match{{PC: len(program) - 1, TC: 0, Bytes: []byte(text)}}
    expected := []machines.Match{{PC: len(program) - 1, TC: 0, StartLine: 1, StartColumn: 1, EndLine: 1, EndColumn: len(text), Bytes: []byte(text), TSLine: 1, TSColumn: 0, TELine: 1, TEColumn: 1}}
    if expected[0].EndColumn == 0 {
        expected[0].EndColumn = 1
    }
    i := 0
    scan := machines.LexerEngine(program, []byte(text))
    for tc, m, err, scan := scan(0); scan != nil; tc, m, err, scan = scan(tc) {
lexer.go
@@ -3,7 +3,6 @@ package lexmachine
import (
    "bytes"
    "fmt"
    "reflect"
    "unicode/utf8"

    dfapkg "gitea.xintech.co/zhouzhihong/lexmachine/dfa"
@@ -54,16 +53,7 @@ func (t *Token) Equals(other *Token) bool {

// String formats the token in a human readable form.
func (t *Token) String() string {
    return fmt.Sprintf(
        "%d %q %d (%d, %d)-(%d, %d)",
        t.Type,
        t.Value,
        t.TC,
        t.StartLine,
        t.StartColumn,
        t.EndLine,
        t.EndColumn,
    )
    return fmt.Sprintf("%d %q %d (%d, %d)-(%d, %d)", t.Type, t.Value, t.TC, t.StartLine, t.StartColumn, t.EndLine, t.EndColumn)
}

// An Action is a function which gets called when the Scanner finds a match
@@ -119,7 +109,12 @@ type Scanner struct {
    Text []byte
    TC int
    pTC int
    lpp map[int]lastPostion
    sLine int
    sColumn int
    eLine int
    eColumn int

    lpp map[int]lastPostion
}

type lastPostion struct {
@@ -151,61 +146,74 @@ type lastPostion struct {
func (s *Scanner) Next() (tok interface{}, err error, eos bool) {
    var token interface{}
    for token == nil {
        ntc, match, err, scan := s.scan(s.TC)
        tc, match, err, scan := s.scan(s.TC)
        if scan == nil {
            return nil, nil, true
        } else if err != nil {
            return nil, err, false
        } else if match == nil {
            return nil, fmt.Errorf("no match but no error"), false
            return nil, fmt.Errorf("No match but no error"), false
        }
        s.scan = scan
        s.pTC = s.TC
        s.TC = ntc
        s.TC = tc
        s.sLine = match.StartLine
        s.sColumn = match.StartColumn
        s.eLine = match.EndLine
        s.eColumn = match.EndColumn

        p := s.pTC
        l, c := s.lpp[p].l, s.lpp[p].c
        stc := s.TC - len(match.Bytes)

        for {
            if s.Text[p] == '\n' {
                l++
                c = 0
            } else {
                c++
            }

            if p == stc {
                match.TSLine = l
                match.TSColumn = c
            }

            match.TELine = l
            match.TEColumn = c

            _, sz := utf8.DecodeRune(s.Text[p:])
            p += sz
            if p >= s.TC {
                break
            }
        }

        s.lpp[s.TC] = lastPostion{
            l: l,
            c: c,
        }

        pattern := s.lexer.patterns[s.matches[match.PC]]
        token, err = pattern.action(s, match)

        lpp := s.lpp[s.pTC]
        line, col := lpp.l, lpp.c
        for i := s.pTC; i < s.TC; {
            ch, sz := utf8.DecodeRune(s.Text[i:])
            if ch == rune('\n') {
                line++
                col = 0
            } else {
                col++
            }

            if (i == match.TC) && reflect.TypeOf(token) == reflect.TypeOf(&Token{}) {
                token.(*Token).StartLine = line
                token.(*Token).StartColumn = col
            }
            i += sz
        }

        if reflect.TypeOf(token) == reflect.TypeOf(&Token{}) {
            token.(*Token).EndLine = line
            token.(*Token).EndColumn = col
        }

        s.lpp[s.TC] = lastPostion{l: line, c: col}

        if err != nil {
            return nil, err, false
        }
    }

    return token, nil, false
}

// Token is a helper function for constructing a Token type inside of an Action.
func (s *Scanner) Token(typ int, value interface{}, m *machines.Match) *Token {
    return &Token{
        Type: typ,
        Value: value,
        Lexeme: m.Bytes,
        TC: m.TC,
        Type:        typ,
        Value:       value,
        Lexeme:      m.Bytes,
        TC:          m.TC,
        StartLine:   m.StartLine,
        StartColumn: m.StartColumn,
        EndLine:     m.EndLine,
        EndColumn:   m.EndColumn,
    }
}

@@ -295,7 +303,7 @@ func (l *Lexer) assembleAST() (frontend.AST, error) {
// only created explicitly) this will be used by Scanners when they are created.
func (l *Lexer) CompileNFA() error {
    if len(l.patterns) == 0 {
        return fmt.Errorf("no patterns added")
        return fmt.Errorf("No patterns added")
    }
    if l.program != nil {
        return nil
@@ -325,7 +333,7 @@ func (l *Lexer) CompileNFA() error {
    } else if mes {
        l.program = nil
        l.nfaMatches = nil
        return fmt.Errorf("one or more of the supplied patterns match the empty string")
        return fmt.Errorf("One or more of the supplied patterns match the empty string")
    }

    return nil
@@ -335,7 +343,7 @@ func (l *Lexer) CompileNFA() error {
// they are created.
func (l *Lexer) CompileDFA() error {
    if len(l.patterns) == 0 {
        return fmt.Errorf("no patterns added")
        return fmt.Errorf("No patterns added")
    }
    if l.dfa != nil {
        return nil
@@ -355,7 +363,7 @@ func (l *Lexer) CompileDFA() error {
    } else if mes {
        l.dfa = nil
        l.dfaMatches = nil
        return fmt.Errorf("one or more of the supplied patterns match the empty string")
        return fmt.Errorf("One or more of the supplied patterns match the empty string")
    }
    return nil
}
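The position-tracking loops removed above decode runes with `utf8.DecodeRune`, so a multi-byte UTF-8 character advances the column by one rather than by its byte width. A self-contained sketch of that counting scheme — this is the idea only, not lexmachine API:

```
package main

import (
    "fmt"
    "unicode/utf8"
)

// lineColAt walks text rune by rune up to byte offset tc, so a
// multi-byte UTF-8 character (e.g. 你) advances the column by one,
// not by three.
func lineColAt(text []byte, tc int) (line, col int) {
    line, col = 1, 0
    for i := 0; i < len(text) && i <= tc; {
        r, sz := utf8.DecodeRune(text[i:])
        if r == '\n' {
            line++
            col = 0
        } else {
            col++
        }
        i += sz
    }
    return line, col
}

func main() {
    text := []byte("a\n你好b")
    fmt.Println(lineColAt(text, len(text)-1)) // 2 3: 'b' is the 3rd rune on line 2
}
```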
@@ -77,8 +77,8 @@ func TestSimple(x *testing.T) {
            }
        }
        return nil,
            fmt.Errorf("unclosed comment starting at %d",
                match.TC)
            fmt.Errorf("unclosed comment starting at %d, (%d, %d)",
                match.TC, match.StartLine, match.StartColumn)
        },
    )

@@ -111,6 +111,7 @@ func TestSimple(x *testing.T) {
        {PRINT, nil, []byte("print"), 129, 10, 3, 10, 7},
        {NAME, "printname", []byte("printname"), 135, 10, 9, 10, 17},
    }

    scan := func(lexer *Lexer) {
        scanner, err := lexer.Scanner(text)
        if err != nil {
@@ -291,7 +292,6 @@ func TestRegression(t *testing.T) {
            fmt.Printf("Token: %v\n", tok)
            found++
        }

        if found != test.tokens {
            t.Errorf("Expected exactly %v tokens got %v, ===\nErr: %v\nEOS: %v\nTC: %d\n", test.tokens, found, err, eos, scanner.TC)
        }
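The expectation tuples above appear to follow the `Token` field order from lexer.go in this comparison: `{Type, Value, Lexeme, TC, StartLine, StartColumn, EndLine, EndColumn}`. Under that assumption, a small comparison helper might look like the following (the `expectedTok` struct is hypothetical; `bytes` is the standard library package):

```
type expectedTok struct { // hypothetical mirror of the tuples above
    typ                          int
    lexeme                       []byte
    tc, sLine, sCol, eLine, eCol int
}

// sameToken checks a scanned *Token against an expectation tuple,
// positions included. Assumes the Token fields shown in this diff.
func sameToken(t *Token, e expectedTok) bool {
    return t.Type == e.typ && bytes.Equal(t.Lexeme, e.lexeme) &&
        t.TC == e.tc && t.StartLine == e.sLine && t.StartColumn == e.sCol &&
        t.EndLine == e.eLine && t.EndColumn == e.eCol
}
```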
@@ -34,6 +34,7 @@ func mapLineCols(text []byte) []lineCol {
// DFA state machine. If the lexing process fails the Scanner will return
// an UnconsumedInput error.
func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAccepting, text []byte) Scanner {
    lineCols := mapLineCols(text)
    done := false
    matchID := -1
    matchTC := -1
@@ -61,15 +62,23 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAccepting, text []byte) Scanner {
            }
            state = trans[state][text[tc]]
            if state == errorState && matchID > -1 {
                startLC := lineCols[startTC]
                endLC := lineCols[matchTC-1]
                match := &Match{
                    PC: matchID,
                    TC: startTC,
                    Bytes: text[startTC:matchTC],
                    PC:          matchID,
                    TC:          startTC,
                    StartLine:   startLC.line,
                    StartColumn: startLC.col,
                    EndLine:     endLC.line,
                    EndColumn:   endLC.col,
                    Bytes:       text[startTC:matchTC],
                }
                if matchTC == startTC {
                    err := &EmptyMatchError{
                        MatchID: matchID,
                        TC: startTC,
                        TC:     tc,
                        Line:   startLC.line,
                        Column: startLC.col,
                    }
                    return startTC, nil, err, scan
                }
@@ -82,19 +91,30 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAccepting, text []byte) Scanner {
            matchTC = tc
        }
        if startTC <= len(text) && matchID > -1 && matchTC == startTC {

            var startLC lineCol
            if startTC < len(text) {
                startLC = lineCols[startTC]
            }
            err := &EmptyMatchError{
                MatchID: matchID,
                TC: startTC,
                TC:     tc,
                Line:   startLC.line,
                Column: startLC.col,
            }
            matchID = -1
            return startTC, nil, err, scan
        }
        if startTC < len(text) && matchTC <= len(text) && matchID > -1 {
            startLC := lineCols[startTC]
            endLC := lineCols[matchTC-1]
            match := &Match{
                PC: matchID,
                TC: startTC,
                Bytes: text[startTC:matchTC],
                PC:          matchID,
                TC:          startTC,
                StartLine:   startLC.line,
                StartColumn: startLC.col,
                EndLine:     endLC.line,
                EndColumn:   endLC.col,
                Bytes:       text[startTC:matchTC],
            }
            matchID = -1
            return matchTC, match, nil, scan
@@ -108,12 +128,22 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAccepting, text []byte) Scanner {
            if matchTC == -1 {
                matchTC = 0
            }
            startLC := lineCols[startTC]
            etc := tc

            var endLC lineCol
            if etc >= len(lineCols) {
                endLC = lineCols[len(lineCols)-1]
            } else {
                endLC = lineCols[etc]
            }
            err := &UnconsumedInput{
                StartTC: startTC,
                FailTC: etc,
                Text: text,
                StartTC:     startTC,
                FailTC:      etc,
                StartLine:   startLC.line,
                StartColumn: startLC.col,
                FailLine:    endLC.line,
                FailColumn:  endLC.col,
                Text:        text,
            }
            return tc, nil, err, scan
        } else {
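`DFALexerEngine` now resolves positions through a `lineCols` table built once by `mapLineCols`, trading O(n) memory for O(1) lookups at each match boundary. `mapLineCols` itself lies outside the hunks shown, so the following is an assumed shape, calibrated to the test expectations near the end of this diff (a `'\n'` byte maps to column 0 of the line it opens):

```
type lineCol struct {
    line, col int
}

// mapLineColsSketch builds a per-byte position table: entry i holds
// the line/column of text[i]. Hypothetical reconstruction; the real
// mapLineCols is not shown in this comparison.
func mapLineColsSketch(text []byte) []lineCol {
    lcs := make([]lineCol, len(text))
    line, col := 1, 0
    for i, b := range text {
        if b == '\n' {
            line, col = line+1, 0
        } else {
            col++
        }
        lcs[i] = lineCol{line, col}
    }
    return lcs
}
```

On `"struct\n  *"` this yields `struct` at (1,1)-(1,6), the whitespace run at (2,0)-(2,2), and `*` at (2,3), matching the `TestLexerRestart` expectations below.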
@@ -13,20 +13,26 @@ import (
// string
type EmptyMatchError struct {
    TC int
    Line int
    Column int
    MatchID int
}

func (e *EmptyMatchError) Error() string {
    return fmt.Sprintf("Lexer error: matched the empty string at %d for match id %d.",
        e.TC, e.MatchID,
    return fmt.Sprintf("Lexer error: matched the empty string at %d:%d (tc=%d) for match id %d.",
        e.Line, e.Column, e.TC, e.MatchID,
    )
}

// UnconsumedInput error type
type UnconsumedInput struct {
    StartTC int
    FailTC int
    Text []byte
    StartTC     int
    FailTC      int
    StartLine   int
    StartColumn int
    FailLine    int
    FailColumn  int
    Text        []byte
}

// Error implements the error interface
@@ -45,18 +51,60 @@ func (u *UnconsumedInput) Error() string {
    }
    stc := min(u.StartTC, len(u.Text)-1)
    etc := min(max(u.StartTC+1, u.FailTC), len(u.Text))
    return fmt.Sprintf("Lexer error: could not match text starting at %v failing at %v.\n\tunmatched text: %q",
        u.StartTC,
        u.FailTC,
    return fmt.Sprintf("Lexer error: could not match text starting at %v:%v failing at %v:%v.\n\tunmatched text: %q",
        u.StartLine, u.StartColumn,
        u.FailLine, u.FailColumn,
        string(u.Text[stc:etc]),
    )
}

// A Match represents the positional and textual information from a match.
type Match struct {
    PC int // pattern cursor
    TC int // start position of text cursor
    Bytes []byte // the actual bytes matched during scanning.
    PC          int
    TC          int
    StartLine   int
    StartColumn int
    EndLine     int
    EndColumn   int
    Bytes       []byte // the actual bytes matched during scanning.

    TSLine, TSColumn, TELine, TEColumn int
}

func computeLineCol(text []byte, prevTC, tc, line, col int) (int, int) {
    if tc < 0 {
        return line, col
    }
    if tc < prevTC {
        for i := prevTC; i > tc && i > 0; i-- {
            if text[i] == '\n' {
                line--
            }
        }
        col = 0
        for i := tc; i >= 0; i-- {
            if text[i] == '\n' {
                break
            }
            col++
        }
        return line, col
    }
    for i := prevTC + 1; i <= tc && i < len(text); i++ {
        if text[i] == '\n' {
            col = 0
            line++
        } else {
            col++
        }
    }
    if prevTC == tc && tc == 0 && tc < len(text) {
        if text[tc] == '\n' {
            line++
            col--
        }
    }
    return line, col
}

// Equals checks two matches for equality
@@ -69,13 +117,16 @@ func (m *Match) Equals(other *Match) bool {
        return false
    }
    return m.PC == other.PC &&
        m.TC == other.TC &&
        m.StartLine == other.StartLine &&
        m.StartColumn == other.StartColumn &&
        m.EndLine == other.EndLine &&
        m.EndColumn == other.EndColumn &&
        bytes.Equal(m.Bytes, other.Bytes)
}

// String formats the match for humans
func (m Match) String() string {
    return fmt.Sprintf("<Match %d %d %v'>", m.PC, m.TC, string(m.Bytes))
    return fmt.Sprintf("<Match %d %d (%d, %d)-(%d, %d) '%v'>", m.PC, m.TC, m.StartLine, m.StartColumn, m.EndLine, m.EndColumn, string(m.Bytes))
}

// Scanner is a functional iterator returned by the LexerEngine. See
@@ -90,6 +141,10 @@ func LexerEngine(program inst.Slice, text []byte) Scanner {
    matchPC := -1
    matchTC := -1

    prevTC := 0
    line := 1
    col := 1

    var scan Scanner
    var cqueue, nqueue *queue.Queue = queue.New(len(program)), queue.New(len(program))
    scan = func(tc int) (int, *Match, error, Scanner) {
@@ -142,19 +197,27 @@ func LexerEngine(program inst.Slice, text []byte) Scanner {
        }
        cqueue, nqueue = nqueue, cqueue
        if cqueue.Empty() && matchPC > -1 {

            line, col = computeLineCol(text, prevTC, startTC, line, col)
            eLine, eCol := computeLineCol(text, startTC, matchTC-1, line, col)
            match := &Match{
                PC: matchPC,
                TC: startTC,
                Bytes: text[startTC:matchTC],
                PC:          matchPC,
                TC:          startTC,
                StartLine:   line,
                StartColumn: col,
                EndLine:     eLine,
                EndColumn:   eCol,
                Bytes:       text[startTC:matchTC],
            }
            if matchTC == startTC {
                err := &EmptyMatchError{
                    MatchID: matchPC,
                    TC:      tc,
                    Line:    line,
                    Column:  col,
                }
                return startTC, nil, err, scan
            }
            prevTC = startTC
            matchPC = -1
            return matchTC, match, nil, scan
        }
@@ -168,10 +231,16 @@ func LexerEngine(program inst.Slice, text []byte) Scanner {
        if matchTC == -1 {
            matchTC = 0
        }
        sline, scol := computeLineCol(text, 0, startTC, 1, 1)
        fline, fcol := computeLineCol(text, 0, tc, 1, 1)
        err := &UnconsumedInput{
            StartTC: startTC,
            FailTC: tc,
            Text: text,
            StartTC:     startTC,
            FailTC:      tc,
            StartLine:   sline,
            StartColumn: scol,
            FailLine:    fline,
            FailColumn:  fcol,
            Text:        text,
        }
        return tc, nil, err, scan
    } else {
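`computeLineCol` is incremental: it carries a known `(line, col)` at `prevTC` forward to `tc` (or rewinds when `tc < prevTC`) instead of rescanning from the start of the text on every match. A hedged usage sketch; the function is unexported, so this would sit inside the `machines` package, and `matchStarts` is a hypothetical slice of match offsets:

```
// Incremental position tracking as LexerEngine uses it: advance
// (line, col) from the previous match start to the next one, rather
// than recounting from text[0] each time.
line, col := 1, 1
prevTC := 0
for _, m := range matchStarts { // hypothetical match start offsets
    line, col = computeLineCol(text, prevTC, m, line, col)
    fmt.Printf("match at %d:%d\n", line, col)
    prevTC = m
}
```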
@@ -34,7 +34,7 @@ func TestLexerMatch(t *testing.T) {
    t.Log(program)
    mtext := []byte("ababcbcbb")
    expected := []Match{
        {16, 0, mtext},
        {16, 0, 1, 1, 1, len(mtext), mtext, 1, 0, 1, 1},
    }
    i := 0
    for tc, m, err, scan := LexerEngine(program, text)(0); scan != nil; tc, m, err, scan = scan(tc) {
@@ -114,9 +114,9 @@ func TestLexerThreeStrings(t *testing.T) {
    t.Log(len(text))
    t.Log(program)
    expected := []Match{
        {8, 0, []byte("struct")},
        {13, 6, []byte("  ")},
        {15, 8, []byte("*")},
        {8, 0, 1, 1, 1, 6, []byte("struct"), 1, 0, 1, 1},
        {13, 6, 1, 7, 1, 8, []byte("  "), 1, 0, 1, 1},
        {15, 8, 1, 9, 1, 9, []byte("*"), 1, 0, 1, 1},
    }

    i := 0
@@ -165,9 +165,9 @@ func TestLexerRestart(t *testing.T) {
    t.Log(len(text))
    t.Log(program)
    expected := []Match{
        {8, 0, []byte("struct")},
        {19, 6, []byte("\n  ")},
        {21, 9, []byte("*")},
        {8, 0, 1, 1, 1, 6, []byte("struct"), 1, 0, 1, 1},
        {19, 6, 2, 0, 2, 2, []byte("\n  "), 1, 0, 1, 1},
        {21, 9, 2, 3, 2, 3, []byte("*"), 1, 0, 1, 1},
    }

    check := func(m *Match, i int, err error) {