Compare commits

..

No commits in common. "main" and "dev" have entirely different histories.
main ... dev

7 changed files with 225 additions and 96 deletions

View File

@ -163,4 +163,4 @@ for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
} }
``` ```
//TODO 错误信息的行列计算 // TODO 优化位置标记计算

View File

@ -23,7 +23,10 @@ func TestParse(x *testing.T) {
} }
func tMatch(program inst.Slice, text string, t *test.T) { func tMatch(program inst.Slice, text string, t *test.T) {
expected := []machines.Match{{PC: len(program) - 1, TC: 0, Bytes: []byte(text)}} expected := []machines.Match{{PC: len(program) - 1, TC: 0, StartLine: 1, StartColumn: 1, EndLine: 1, EndColumn: len(text), Bytes: []byte(text), TSLine: 1, TSColumn: 0, TELine: 1, TEColumn: 1}}
if expected[0].EndColumn == 0 {
expected[0].EndColumn = 1
}
i := 0 i := 0
scan := machines.LexerEngine(program, []byte(text)) scan := machines.LexerEngine(program, []byte(text))
for tc, m, err, scan := scan(0); scan != nil; tc, m, err, scan = scan(tc) { for tc, m, err, scan := scan(0); scan != nil; tc, m, err, scan = scan(tc) {

View File

@ -3,7 +3,6 @@ package lexmachine
import ( import (
"bytes" "bytes"
"fmt" "fmt"
"reflect"
"unicode/utf8" "unicode/utf8"
dfapkg "gitea.xintech.co/zhouzhihong/lexmachine/dfa" dfapkg "gitea.xintech.co/zhouzhihong/lexmachine/dfa"
@ -32,6 +31,8 @@ type Token struct {
StartColumn int StartColumn int
EndLine int EndLine int
EndColumn int EndColumn int
TSLine, TSColumn, TELine, TEColumn int
} }
// Equals checks the equality of two tokens ignoring the Value field. // Equals checks the equality of two tokens ignoring the Value field.
@ -55,7 +56,7 @@ func (t *Token) Equals(other *Token) bool {
// String formats the token in a human readable form. // String formats the token in a human readable form.
func (t *Token) String() string { func (t *Token) String() string {
return fmt.Sprintf( return fmt.Sprintf(
"%d %q %d (%d, %d)-(%d, %d)", "%d %q %d (%d, %d)-(%d, %d) (%d, %d)-(%d, %d)",
t.Type, t.Type,
t.Value, t.Value,
t.TC, t.TC,
@ -63,6 +64,10 @@ func (t *Token) String() string {
t.StartColumn, t.StartColumn,
t.EndLine, t.EndLine,
t.EndColumn, t.EndColumn,
t.TSLine,
t.TSColumn,
t.TELine,
t.TEColumn,
) )
} }
@ -119,7 +124,12 @@ type Scanner struct {
Text []byte Text []byte
TC int TC int
pTC int pTC int
lpp map[int]lastPostion sLine int
sColumn int
eLine int
eColumn int
lpp map[int]lastPostion
} }
type lastPostion struct { type lastPostion struct {
@ -151,7 +161,7 @@ type lastPostion struct {
func (s *Scanner) Next() (tok interface{}, err error, eos bool) { func (s *Scanner) Next() (tok interface{}, err error, eos bool) {
var token interface{} var token interface{}
for token == nil { for token == nil {
ntc, match, err, scan := s.scan(s.TC) tc, match, err, scan := s.scan(s.TC)
if scan == nil { if scan == nil {
return nil, nil, true return nil, nil, true
} else if err != nil { } else if err != nil {
@ -161,51 +171,68 @@ func (s *Scanner) Next() (tok interface{}, err error, eos bool) {
} }
s.scan = scan s.scan = scan
s.pTC = s.TC s.pTC = s.TC
s.TC = ntc s.TC = tc
s.sLine = match.StartLine
s.sColumn = match.StartColumn
s.eLine = match.EndLine
s.eColumn = match.EndColumn
p := s.pTC
l, c := s.lpp[p].l, s.lpp[p].c
stc := s.TC - len(match.Bytes)
for {
if s.Text[p] == '\n' {
l++
c = 0
} else {
c++
}
if p == stc {
match.TSLine = l
match.TSColumn = c
}
match.TELine = l
match.TEColumn = c
_, sz := utf8.DecodeRune(s.Text[p:])
p += sz
if p >= s.TC {
break
}
}
s.lpp[s.TC] = lastPostion{
l: l,
c: c,
}
pattern := s.lexer.patterns[s.matches[match.PC]] pattern := s.lexer.patterns[s.matches[match.PC]]
token, err = pattern.action(s, match) token, err = pattern.action(s, match)
lpp := s.lpp[s.pTC]
line, col := lpp.l, lpp.c
for i := s.pTC; i < s.TC; {
ch, sz := utf8.DecodeRune(s.Text[i:])
if ch == rune('\n') {
line++
col = 0
} else {
col++
}
if (i == match.TC) && reflect.TypeOf(token) == reflect.TypeOf(&Token{}) {
token.(*Token).StartLine = line
token.(*Token).StartColumn = col
}
i += sz
}
if reflect.TypeOf(token) == reflect.TypeOf(&Token{}) {
token.(*Token).EndLine = line
token.(*Token).EndColumn = col
}
s.lpp[s.TC] = lastPostion{l: line, c: col}
if err != nil { if err != nil {
return nil, err, false return nil, err, false
} }
} }
return token, nil, false return token, nil, false
} }
// Token is a helper function for constructing a Token type inside of a Action. // Token is a helper function for constructing a Token type inside of a Action.
func (s *Scanner) Token(typ int, value interface{}, m *machines.Match) *Token { func (s *Scanner) Token(typ int, value interface{}, m *machines.Match) *Token {
return &Token{ return &Token{
Type: typ, Type: typ,
Value: value, Value: value,
Lexeme: m.Bytes, Lexeme: m.Bytes,
TC: m.TC, TC: m.TC,
StartLine: m.StartLine,
StartColumn: m.StartColumn,
EndLine: m.EndLine,
EndColumn: m.EndColumn,
TSLine: m.TSLine,
TSColumn: m.TSColumn,
TELine: m.TELine,
TEColumn: m.TEColumn,
} }
} }

View File

@ -77,8 +77,8 @@ func TestSimple(x *testing.T) {
} }
} }
return nil, return nil,
fmt.Errorf("unclosed comment starting at %d", fmt.Errorf("unclosed comment starting at %d, (%d, %d)",
match.TC) match.TC, match.StartLine, match.StartColumn)
}, },
) )
@ -95,22 +95,23 @@ func TestSimple(x *testing.T) {
`) `)
expected := []*Token{ expected := []*Token{
{NAME, "name", []byte("name"), 3, 2, 3, 2, 6}, {NAME, "name", []byte("name"), 3, 2, 3, 2, 6, 2, 3, 2, 6},
{EQUALS, nil, []byte("="), 8, 2, 8, 2, 8}, {EQUALS, nil, []byte("="), 8, 2, 8, 2, 8, 2, 8, 2, 8},
{NUMBER, 10, []byte("10"), 10, 2, 10, 2, 11}, {NUMBER, 10, []byte("10"), 10, 2, 10, 2, 11, 2, 10, 2, 11},
{PRINT, nil, []byte("print"), 15, 3, 3, 3, 7}, {PRINT, nil, []byte("print"), 15, 3, 3, 3, 7, 3, 3, 3, 7},
{NAME, "name", []byte("name"), 21, 3, 9, 3, 12}, {NAME, "name", []byte("name"), 21, 3, 9, 3, 12, 3, 9, 3, 12},
{PRINT, nil, []byte("print"), 28, 4, 3, 4, 7}, {PRINT, nil, []byte("print"), 28, 4, 3, 4, 7, 4, 3, 4, 7},
{NAME, "fred", []byte("fred"), 34, 4, 9, 4, 12}, {NAME, "fred", []byte("fred"), 34, 4, 9, 4, 12, 4, 9, 4, 12},
{NAME, "name", []byte("name"), 41, 5, 3, 5, 6}, {NAME, "name", []byte("name"), 41, 5, 3, 5, 6, 5, 3, 5, 6},
{EQUALS, nil, []byte("="), 46, 5, 8, 5, 8}, {EQUALS, nil, []byte("="), 46, 5, 8, 5, 8, 5, 8, 5, 8},
{NUMBER, 12, []byte("12"), 47, 5, 9, 5, 10}, {NUMBER, 12, []byte("12"), 47, 5, 9, 5, 10, 5, 9, 5, 10},
{NAME, "printname", []byte("printname"), 112, 9, 11, 9, 19}, {NAME, "printname", []byte("printname"), 112, 9, 11, 9, 19, 9, 11, 9, 19},
{EQUALS, nil, []byte("="), 122, 9, 21, 9, 21}, {EQUALS, nil, []byte("="), 122, 9, 21, 9, 21, 9, 21, 9, 21},
{NUMBER, 13, []byte("13"), 124, 9, 23, 9, 24}, {NUMBER, 13, []byte("13"), 124, 9, 23, 9, 24, 9, 23, 9, 24},
{PRINT, nil, []byte("print"), 129, 10, 3, 10, 7}, {PRINT, nil, []byte("print"), 129, 10, 3, 10, 7, 10, 3, 10, 7},
{NAME, "printname", []byte("printname"), 135, 10, 9, 10, 17}, {NAME, "printname", []byte("printname"), 135, 10, 9, 10, 17, 10, 9, 10, 17},
} }
scan := func(lexer *Lexer) { scan := func(lexer *Lexer) {
scanner, err := lexer.Scanner(text) scanner, err := lexer.Scanner(text)
if err != nil { if err != nil {
@ -291,7 +292,6 @@ func TestRegression(t *testing.T) {
fmt.Printf("Token: %v\n", tok) fmt.Printf("Token: %v\n", tok)
found++ found++
} }
if found != test.tokens { if found != test.tokens {
t.Errorf("Expected exactly %v tokens got %v, ===\nErr: %v\nEOS: %v\nTC: %d\n", test.tokens, found, err, eos, scanner.TC) t.Errorf("Expected exactly %v tokens got %v, ===\nErr: %v\nEOS: %v\nTC: %d\n", test.tokens, found, err, eos, scanner.TC)
} }

View File

@ -34,6 +34,7 @@ func mapLineCols(text []byte) []lineCol {
// DFA state machine. If the lexing process fails the Scanner will return // DFA state machine. If the lexing process fails the Scanner will return
// an UnconsumedInput error. // an UnconsumedInput error.
func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAccepting, text []byte) Scanner { func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAccepting, text []byte) Scanner {
lineCols := mapLineCols(text)
done := false done := false
matchID := -1 matchID := -1
matchTC := -1 matchTC := -1
@ -61,15 +62,23 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAcc
} }
state = trans[state][text[tc]] state = trans[state][text[tc]]
if state == errorState && matchID > -1 { if state == errorState && matchID > -1 {
startLC := lineCols[startTC]
endLC := lineCols[matchTC-1]
match := &Match{ match := &Match{
PC: matchID, PC: matchID,
TC: startTC, TC: startTC,
Bytes: text[startTC:matchTC], StartLine: startLC.line,
StartColumn: startLC.col,
EndLine: endLC.line,
EndColumn: endLC.col,
Bytes: text[startTC:matchTC],
} }
if matchTC == startTC { if matchTC == startTC {
err := &EmptyMatchError{ err := &EmptyMatchError{
MatchID: matchID, MatchID: matchID,
TC: startTC, TC: tc,
Line: startLC.line,
Column: startLC.col,
} }
return startTC, nil, err, scan return startTC, nil, err, scan
} }
@ -82,19 +91,30 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAcc
matchTC = tc matchTC = tc
} }
if startTC <= len(text) && matchID > -1 && matchTC == startTC { if startTC <= len(text) && matchID > -1 && matchTC == startTC {
var startLC lineCol
if startTC < len(text) {
startLC = lineCols[startTC]
}
err := &EmptyMatchError{ err := &EmptyMatchError{
MatchID: matchID, MatchID: matchID,
TC: startTC, TC: tc,
Line: startLC.line,
Column: startLC.col,
} }
matchID = -1 matchID = -1
return startTC, nil, err, scan return startTC, nil, err, scan
} }
if startTC < len(text) && matchTC <= len(text) && matchID > -1 { if startTC < len(text) && matchTC <= len(text) && matchID > -1 {
startLC := lineCols[startTC]
endLC := lineCols[matchTC-1]
match := &Match{ match := &Match{
PC: matchID, PC: matchID,
TC: startTC, TC: startTC,
Bytes: text[startTC:matchTC], StartLine: startLC.line,
StartColumn: startLC.col,
EndLine: endLC.line,
EndColumn: endLC.col,
Bytes: text[startTC:matchTC],
} }
matchID = -1 matchID = -1
return matchTC, match, nil, scan return matchTC, match, nil, scan
@ -108,12 +128,22 @@ func DFALexerEngine(startState, errorState int, trans DFATrans, accepting DFAAcc
if matchTC == -1 { if matchTC == -1 {
matchTC = 0 matchTC = 0
} }
startLC := lineCols[startTC]
etc := tc etc := tc
var endLC lineCol
if etc >= len(lineCols) {
endLC = lineCols[len(lineCols)-1]
} else {
endLC = lineCols[etc]
}
err := &UnconsumedInput{ err := &UnconsumedInput{
StartTC: startTC, StartTC: startTC,
FailTC: etc, FailTC: etc,
Text: text, StartLine: startLC.line,
StartColumn: startLC.col,
FailLine: endLC.line,
FailColumn: endLC.col,
Text: text,
} }
return tc, nil, err, scan return tc, nil, err, scan
} else { } else {

View File

@ -13,20 +13,26 @@ import (
// string // string
type EmptyMatchError struct { type EmptyMatchError struct {
TC int TC int
Line int
Column int
MatchID int MatchID int
} }
func (e *EmptyMatchError) Error() string { func (e *EmptyMatchError) Error() string {
return fmt.Sprintf("Lexer error: matched the empty string at %d for match id %d.", return fmt.Sprintf("Lexer error: matched the empty string at %d:%d (tc=%d) for match id %d.",
e.TC, e.MatchID, e.Line, e.Column, e.TC, e.MatchID,
) )
} }
// UnconsumedInput error type // UnconsumedInput error type
type UnconsumedInput struct { type UnconsumedInput struct {
StartTC int StartTC int
FailTC int FailTC int
Text []byte StartLine int
StartColumn int
FailLine int
FailColumn int
Text []byte
} }
// Error implements the error interface // Error implements the error interface
@ -45,18 +51,60 @@ func (u *UnconsumedInput) Error() string {
} }
stc := min(u.StartTC, len(u.Text)-1) stc := min(u.StartTC, len(u.Text)-1)
etc := min(max(u.StartTC+1, u.FailTC), len(u.Text)) etc := min(max(u.StartTC+1, u.FailTC), len(u.Text))
return fmt.Sprintf("Lexer error: could not match text starting at %v failing at %v.\n\tunmatched text: %q", return fmt.Sprintf("Lexer error: could not match text starting at %v:%v failing at %v:%v.\n\tunmatched text: %q",
u.StartTC, u.StartLine, u.StartColumn,
u.FailTC, u.FailLine, u.FailColumn,
string(u.Text[stc:etc]), string(u.Text[stc:etc]),
) )
} }
// A Match represents the positional and textual information from a match. // A Match represents the positional and textual information from a match.
type Match struct { type Match struct {
PC int // pattern cursor PC int
TC int // start position of text cursor TC int
Bytes []byte // the actual bytes matched during scanning. StartLine int
StartColumn int
EndLine int
EndColumn int
Bytes []byte // the actual bytes matched during scanning.
TSLine, TSColumn, TELine, TEColumn int
}
func computeLineCol(text []byte, prevTC, tc, line, col int) (int, int) {
if tc < 0 {
return line, col
}
if tc < prevTC {
for i := prevTC; i > tc && i > 0; i-- {
if text[i] == '\n' {
line--
}
}
col = 0
for i := tc; i >= 0; i-- {
if text[i] == '\n' {
break
}
col++
}
return line, col
}
for i := prevTC + 1; i <= tc && i < len(text); i++ {
if text[i] == '\n' {
col = 0
line++
} else {
col++
}
}
if prevTC == tc && tc == 0 && tc < len(text) {
if text[tc] == '\n' {
line++
col--
}
}
return line, col
} }
// Equals checks two matches for equality // Equals checks two matches for equality
@ -69,13 +117,16 @@ func (m *Match) Equals(other *Match) bool {
return false return false
} }
return m.PC == other.PC && return m.PC == other.PC &&
m.TC == other.TC && m.StartLine == other.StartLine &&
m.StartColumn == other.StartColumn &&
m.EndLine == other.EndLine &&
m.EndColumn == other.EndColumn &&
bytes.Equal(m.Bytes, other.Bytes) bytes.Equal(m.Bytes, other.Bytes)
} }
// String formats the match for humans // String formats the match for humans
func (m Match) String() string { func (m Match) String() string {
return fmt.Sprintf("<Match %d %d %v'>", m.PC, m.TC, string(m.Bytes)) return fmt.Sprintf("<Match %d %d (%d, %d)-(%d, %d) '%v'>", m.PC, m.TC, m.StartLine, m.StartColumn, m.EndLine, m.EndColumn, string(m.Bytes))
} }
// Scanner is a functional iterator returned by the LexerEngine. See // Scanner is a functional iterator returned by the LexerEngine. See
@ -90,6 +141,10 @@ func LexerEngine(program inst.Slice, text []byte) Scanner {
matchPC := -1 matchPC := -1
matchTC := -1 matchTC := -1
prevTC := 0
line := 1
col := 1
var scan Scanner var scan Scanner
var cqueue, nqueue *queue.Queue = queue.New(len(program)), queue.New(len(program)) var cqueue, nqueue *queue.Queue = queue.New(len(program)), queue.New(len(program))
scan = func(tc int) (int, *Match, error, Scanner) { scan = func(tc int) (int, *Match, error, Scanner) {
@ -142,19 +197,27 @@ func LexerEngine(program inst.Slice, text []byte) Scanner {
} }
cqueue, nqueue = nqueue, cqueue cqueue, nqueue = nqueue, cqueue
if cqueue.Empty() && matchPC > -1 { if cqueue.Empty() && matchPC > -1 {
line, col = computeLineCol(text, prevTC, startTC, line, col)
eLine, eCol := computeLineCol(text, startTC, matchTC-1, line, col)
match := &Match{ match := &Match{
PC: matchPC, PC: matchPC,
TC: startTC, TC: startTC,
Bytes: text[startTC:matchTC], StartLine: line,
StartColumn: col,
EndLine: eLine,
EndColumn: eCol,
Bytes: text[startTC:matchTC],
} }
if matchTC == startTC { if matchTC == startTC {
err := &EmptyMatchError{ err := &EmptyMatchError{
MatchID: matchPC, MatchID: matchPC,
TC: tc, TC: tc,
Line: line,
Column: col,
} }
return startTC, nil, err, scan return startTC, nil, err, scan
} }
prevTC = startTC
matchPC = -1 matchPC = -1
return matchTC, match, nil, scan return matchTC, match, nil, scan
} }
@ -168,10 +231,16 @@ func LexerEngine(program inst.Slice, text []byte) Scanner {
if matchTC == -1 { if matchTC == -1 {
matchTC = 0 matchTC = 0
} }
sline, scol := computeLineCol(text, 0, startTC, 1, 1)
fline, fcol := computeLineCol(text, 0, tc, 1, 1)
err := &UnconsumedInput{ err := &UnconsumedInput{
StartTC: startTC, StartTC: startTC,
FailTC: tc, FailTC: tc,
Text: text, StartLine: sline,
StartColumn: scol,
FailLine: fline,
FailColumn: fcol,
Text: text,
} }
return tc, nil, err, scan return tc, nil, err, scan
} else { } else {

View File

@ -34,7 +34,7 @@ func TestLexerMatch(t *testing.T) {
t.Log(program) t.Log(program)
mtext := []byte("ababcbcbb") mtext := []byte("ababcbcbb")
expected := []Match{ expected := []Match{
{16, 0, mtext}, {16, 0, 1, 1, 1, len(mtext), mtext, 1, 0, 1, 1},
} }
i := 0 i := 0
for tc, m, err, scan := LexerEngine(program, text)(0); scan != nil; tc, m, err, scan = scan(tc) { for tc, m, err, scan := LexerEngine(program, text)(0); scan != nil; tc, m, err, scan = scan(tc) {
@ -114,9 +114,9 @@ func TestLexerThreeStrings(t *testing.T) {
t.Log(len(text)) t.Log(len(text))
t.Log(program) t.Log(program)
expected := []Match{ expected := []Match{
{8, 0, []byte("struct")}, {8, 0, 1, 1, 1, 6, []byte("struct"), 1, 0, 1, 1},
{13, 6, []byte(" ")}, {13, 6, 1, 7, 1, 8, []byte(" "), 1, 0, 1, 1},
{15, 8, []byte("*")}, {15, 8, 1, 9, 1, 9, []byte("*"), 1, 0, 1, 1},
} }
i := 0 i := 0
@ -165,9 +165,9 @@ func TestLexerRestart(t *testing.T) {
t.Log(len(text)) t.Log(len(text))
t.Log(program) t.Log(program)
expected := []Match{ expected := []Match{
{8, 0, []byte("struct")}, {8, 0, 1, 1, 1, 6, []byte("struct"), 1, 0, 1, 1},
{19, 6, []byte("\n ")}, {19, 6, 2, 0, 2, 2, []byte("\n "), 1, 0, 1, 1},
{21, 9, []byte("*")}, {21, 9, 2, 3, 2, 3, []byte("*"), 1, 0, 1, 1},
} }
check := func(m *Match, i int, err error) { check := func(m *Match, i int, err error) {