183 lines
4.2 KiB
Go
183 lines
4.2 KiB
Go
// Package machines implements the lexing algorithms.
|
|
package machines
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
|
|
"gitea.xintech.co/zhouzhihong/lexmachine/inst"
|
|
"gitea.xintech.co/zhouzhihong/lexmachine/queue"
|
|
)
|
|
|
|
// EmptyMatchError is returned when a pattern would have matched the empty
|
|
// string
|
|
type EmptyMatchError struct {
|
|
TC int
|
|
MatchID int
|
|
}
|
|
|
|
func (e *EmptyMatchError) Error() string {
|
|
return fmt.Sprintf("Lexer error: matched the empty string at %d for match id %d.",
|
|
e.TC, e.MatchID,
|
|
)
|
|
}
|
|
|
|
// UnconsumedInput error type
|
|
type UnconsumedInput struct {
|
|
StartTC int
|
|
FailTC int
|
|
Text []byte
|
|
}
|
|
|
|
// Error implements the error interface
|
|
func (u *UnconsumedInput) Error() string {
|
|
min := func(a, b int) int {
|
|
if a < b {
|
|
return a
|
|
}
|
|
return b
|
|
}
|
|
max := func(a, b int) int {
|
|
if a > b {
|
|
return a
|
|
}
|
|
return b
|
|
}
|
|
stc := min(u.StartTC, len(u.Text)-1)
|
|
etc := min(max(u.StartTC+1, u.FailTC), len(u.Text))
|
|
return fmt.Sprintf("Lexer error: could not match text starting at %v failing at %v.\n\tunmatched text: %q",
|
|
u.StartTC,
|
|
u.FailTC,
|
|
string(u.Text[stc:etc]),
|
|
)
|
|
}
|
|
|
|
// A Match represents the positional and textual information from a match.
|
|
type Match struct {
|
|
PC int // pattern cursor
|
|
TC int // start position of text cursor
|
|
Bytes []byte // the actual bytes matched during scanning.
|
|
}
|
|
|
|
// Equals checks two matches for equality
|
|
func (m *Match) Equals(other *Match) bool {
|
|
if m == nil && other == nil {
|
|
return true
|
|
} else if m == nil {
|
|
return false
|
|
} else if other == nil {
|
|
return false
|
|
}
|
|
return m.PC == other.PC &&
|
|
m.TC == other.TC &&
|
|
bytes.Equal(m.Bytes, other.Bytes)
|
|
}
|
|
|
|
// String formats the match for humans
|
|
func (m Match) String() string {
|
|
return fmt.Sprintf("<Match %d %d %v'>", m.PC, m.TC, string(m.Bytes))
|
|
}
|
|
|
|
// Scanner is a functional iterator returned by the LexerEngine. See
|
|
// http://hackthology.com/functional-iteration-in-go.html
|
|
type Scanner func(int) (int, *Match, error, Scanner)
|
|
|
|
// LexerEngine does the actual tokenization of the byte slice text using the
|
|
// NFA bytecode in program. If the lexing process fails the Scanner will return
|
|
// an UnconsumedInput error.
|
|
func LexerEngine(program inst.Slice, text []byte) Scanner {
|
|
done := false
|
|
matchPC := -1
|
|
matchTC := -1
|
|
|
|
var scan Scanner
|
|
var cqueue, nqueue *queue.Queue = queue.New(len(program)), queue.New(len(program))
|
|
scan = func(tc int) (int, *Match, error, Scanner) {
|
|
if done && tc == len(text) {
|
|
return tc, nil, nil, nil
|
|
}
|
|
startTC := tc
|
|
if tc < matchTC {
|
|
// we back-tracked so reset the last matchTC
|
|
matchTC = -1
|
|
} else if tc == matchTC {
|
|
// the caller did not reset the tc, we are where we left
|
|
} else if matchTC != -1 && tc > matchTC {
|
|
// we skipped text
|
|
matchTC = tc
|
|
}
|
|
cqueue.Clear()
|
|
nqueue.Clear()
|
|
cqueue.Push(0)
|
|
for ; tc <= len(text); tc++ {
|
|
if cqueue.Empty() {
|
|
break
|
|
}
|
|
for !cqueue.Empty() {
|
|
pc := cqueue.Pop()
|
|
i := program[pc]
|
|
switch i.Op {
|
|
case inst.CHAR:
|
|
x := byte(i.X)
|
|
y := byte(i.Y)
|
|
if tc < len(text) && x <= text[tc] && text[tc] <= y {
|
|
nqueue.Push(pc + 1)
|
|
}
|
|
case inst.MATCH:
|
|
if matchTC < tc {
|
|
matchPC = int(pc)
|
|
matchTC = tc
|
|
} else if matchPC > int(pc) {
|
|
matchPC = int(pc)
|
|
matchTC = tc
|
|
}
|
|
case inst.JMP:
|
|
cqueue.Push(i.X)
|
|
case inst.SPLIT:
|
|
cqueue.Push(i.X)
|
|
cqueue.Push(i.Y)
|
|
default:
|
|
panic(fmt.Errorf("unexpected instruction %v", i))
|
|
}
|
|
}
|
|
cqueue, nqueue = nqueue, cqueue
|
|
if cqueue.Empty() && matchPC > -1 {
|
|
|
|
match := &Match{
|
|
PC: matchPC,
|
|
TC: startTC,
|
|
Bytes: text[startTC:matchTC],
|
|
}
|
|
if matchTC == startTC {
|
|
err := &EmptyMatchError{
|
|
MatchID: matchPC,
|
|
TC: tc,
|
|
}
|
|
return startTC, nil, err, scan
|
|
}
|
|
matchPC = -1
|
|
return matchTC, match, nil, scan
|
|
}
|
|
}
|
|
if matchTC != len(text) && startTC >= len(text) {
|
|
// the user has moved us farther than the text. Assume that was
|
|
// the intent and return EOF.
|
|
return tc, nil, nil, nil
|
|
} else if matchTC != len(text) {
|
|
done = true
|
|
if matchTC == -1 {
|
|
matchTC = 0
|
|
}
|
|
err := &UnconsumedInput{
|
|
StartTC: startTC,
|
|
FailTC: tc,
|
|
Text: text,
|
|
}
|
|
return tc, nil, err, scan
|
|
} else {
|
|
return tc, nil, nil, nil
|
|
}
|
|
}
|
|
return scan
|
|
}
|