// Package machines implements the lexing algorithms. package machines import ( "bytes" "fmt" "gitea.xintech.co/zhouzhihong/lexmachine/inst" "gitea.xintech.co/zhouzhihong/lexmachine/queue" ) // EmptyMatchError is returned when a pattern would have matched the empty // string type EmptyMatchError struct { TC int MatchID int } func (e *EmptyMatchError) Error() string { return fmt.Sprintf("Lexer error: matched the empty string at %d for match id %d.", e.TC, e.MatchID, ) } // UnconsumedInput error type type UnconsumedInput struct { StartTC int FailTC int Text []byte } // Error implements the error interface func (u *UnconsumedInput) Error() string { min := func(a, b int) int { if a < b { return a } return b } max := func(a, b int) int { if a > b { return a } return b } stc := min(u.StartTC, len(u.Text)-1) etc := min(max(u.StartTC+1, u.FailTC), len(u.Text)) return fmt.Sprintf("Lexer error: could not match text starting at %v failing at %v.\n\tunmatched text: %q", u.StartTC, u.FailTC, string(u.Text[stc:etc]), ) } // A Match represents the positional and textual information from a match. type Match struct { PC int // pattern cursor TC int // start position of text cursor Bytes []byte // the actual bytes matched during scanning. } // Equals checks two matches for equality func (m *Match) Equals(other *Match) bool { if m == nil && other == nil { return true } else if m == nil { return false } else if other == nil { return false } return m.PC == other.PC && m.TC == other.TC && bytes.Equal(m.Bytes, other.Bytes) } // String formats the match for humans func (m Match) String() string { return fmt.Sprintf("", m.PC, m.TC, string(m.Bytes)) } // Scanner is a functional iterator returned by the LexerEngine. See // http://hackthology.com/functional-iteration-in-go.html type Scanner func(int) (int, *Match, error, Scanner) // LexerEngine does the actual tokenization of the byte slice text using the // NFA bytecode in program. If the lexing process fails the Scanner will return // an UnconsumedInput error. func LexerEngine(program inst.Slice, text []byte) Scanner { done := false matchPC := -1 matchTC := -1 var scan Scanner var cqueue, nqueue *queue.Queue = queue.New(len(program)), queue.New(len(program)) scan = func(tc int) (int, *Match, error, Scanner) { if done && tc == len(text) { return tc, nil, nil, nil } startTC := tc if tc < matchTC { // we back-tracked so reset the last matchTC matchTC = -1 } else if tc == matchTC { // the caller did not reset the tc, we are where we left } else if matchTC != -1 && tc > matchTC { // we skipped text matchTC = tc } cqueue.Clear() nqueue.Clear() cqueue.Push(0) for ; tc <= len(text); tc++ { if cqueue.Empty() { break } for !cqueue.Empty() { pc := cqueue.Pop() i := program[pc] switch i.Op { case inst.CHAR: x := byte(i.X) y := byte(i.Y) if tc < len(text) && x <= text[tc] && text[tc] <= y { nqueue.Push(pc + 1) } case inst.MATCH: if matchTC < tc { matchPC = int(pc) matchTC = tc } else if matchPC > int(pc) { matchPC = int(pc) matchTC = tc } case inst.JMP: cqueue.Push(i.X) case inst.SPLIT: cqueue.Push(i.X) cqueue.Push(i.Y) default: panic(fmt.Errorf("unexpected instruction %v", i)) } } cqueue, nqueue = nqueue, cqueue if cqueue.Empty() && matchPC > -1 { match := &Match{ PC: matchPC, TC: startTC, Bytes: text[startTC:matchTC], } if matchTC == startTC { err := &EmptyMatchError{ MatchID: matchPC, TC: tc, } return startTC, nil, err, scan } matchPC = -1 return matchTC, match, nil, scan } } if matchTC != len(text) && startTC >= len(text) { // the user has moved us farther than the text. Assume that was // the intent and return EOF. return tc, nil, nil, nil } else if matchTC != len(text) { done = true if matchTC == -1 { matchTC = 0 } err := &UnconsumedInput{ StartTC: startTC, FailTC: tc, Text: text, } return tc, nil, err, scan } else { return tc, nil, nil, nil } } return scan }