1. 增加utf8文本的行数和列数计算字段 Co-authored-by: zhouzhihong <zhouzhihong@ijunhai.com> Reviewed-on: https://gitea.xintech.co/zhouzhihong/lexmachine/pulls/1
572 lines
13 KiB
Go
572 lines
13 KiB
Go
// Package frontend parses regular expressions and compiles them into NFA
|
|
// bytecode
|
|
package frontend
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
"runtime"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// Turn on debug prints
|
|
var DEBUG = false
|
|
|
|
// ParseError gives structured errors for parsing problems.
|
|
type ParseError struct {
|
|
Reason string
|
|
Production string
|
|
TC int
|
|
text []byte
|
|
chain []*ParseError
|
|
}
|
|
|
|
// Errorf constructs a parse error with format for a particular location.
|
|
func Errorf(text []byte, tc int, format string, args ...interface{}) *ParseError {
|
|
pc, _, _, ok := runtime.Caller(1)
|
|
return errorf(pc, ok, text, tc, format, args...)
|
|
}
|
|
|
|
func matchErrorf(text []byte, tc int, format string, args ...interface{}) *ParseError {
|
|
pc, _, _, ok := runtime.Caller(2)
|
|
return errorf(pc, ok, text, tc, format, args...)
|
|
}
|
|
|
|
func errorf(pc uintptr, ok bool, text []byte, tc int, format string, args ...interface{}) *ParseError {
|
|
var fn = "unknown"
|
|
if ok {
|
|
fn = runtime.FuncForPC(pc).Name()
|
|
split := strings.Split(fn, ".")
|
|
fn = split[len(split)-1]
|
|
}
|
|
msg := fmt.Sprintf(format, args...)
|
|
return &ParseError{
|
|
Reason: msg,
|
|
Production: fn,
|
|
TC: tc,
|
|
text: text,
|
|
}
|
|
}
|
|
|
|
// Error implements the error interface
|
|
func (p *ParseError) Error() string {
|
|
errs := make([]string, 0, len(p.chain)+1)
|
|
for i := len(p.chain) - 1; i >= 0; i-- {
|
|
errs = append(errs, p.chain[i].Error())
|
|
}
|
|
errs = append(errs, p.error())
|
|
return strings.Join(errs, "\n")
|
|
}
|
|
|
|
func (p *ParseError) error() string {
|
|
line, col := LineCol(p.text, p.TC)
|
|
return fmt.Sprintf("Regex parse error in production '%v' : at index %v line %v column %v '%s' : %v",
|
|
p.Production, p.TC, line, col, p.text[p.TC:], p.Reason)
|
|
}
|
|
|
|
// String formats the error for humans
|
|
func (p *ParseError) String() string {
|
|
return p.Error()
|
|
}
|
|
|
|
// Chain joins multiple ParseErrors together
|
|
func (p *ParseError) Chain(e *ParseError) *ParseError {
|
|
p.chain = append(p.chain, e)
|
|
return p
|
|
}
|
|
|
|
// LineCol computes the line and column of a particular index inside of a byte
|
|
// slice.
|
|
func LineCol(text []byte, tc int) (line int, col int) {
|
|
for i := 0; i <= tc && i < len(text); i++ {
|
|
if text[i] == '\n' {
|
|
col = 0
|
|
line++
|
|
} else {
|
|
col++
|
|
}
|
|
}
|
|
if tc == 0 && tc < len(text) {
|
|
if text[tc] == '\n' {
|
|
line++
|
|
col--
|
|
}
|
|
}
|
|
return line, col
|
|
}
|
|
|
|
// Parse a regular expression into an Abstract Syntax Tree (AST)
|
|
func Parse(text []byte) (AST, error) {
|
|
a, err := (&parser{
|
|
text: text,
|
|
lastError: Errorf(text, 0, "unconsumed input"),
|
|
}).regex()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return a, nil
|
|
}
|
|
|
|
type parser struct {
|
|
text []byte
|
|
lastError *ParseError
|
|
}
|
|
|
|
func (p *parser) regex() (AST, *ParseError) {
|
|
i, ast, err := p.alternation(0)
|
|
if err != nil {
|
|
return nil, err
|
|
} else if i != len(p.text) {
|
|
return nil, p.lastError
|
|
}
|
|
return NewMatch(ast), nil
|
|
}
|
|
|
|
func (p *parser) alternation(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter alternation %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit alternation %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
i, A, err := p.atomicOps(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
i, B, err := p.alternation_(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
return i, NewAlternation(A, B), nil
|
|
}
|
|
|
|
func (p *parser) alternation_(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter alternation_ %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit alternation_ %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
if i >= len(p.text) {
|
|
return i, nil, nil
|
|
}
|
|
i, err := p.match(i, '|')
|
|
if err != nil {
|
|
return i, nil, nil
|
|
}
|
|
i, A, err := p.atomicOps(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
i, B, err := p.alternation_(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
return i, NewAlternation(A, B), nil
|
|
}
|
|
|
|
func (p *parser) atomicOps(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter atomicOps %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit atomicOps %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
if i >= len(p.text) {
|
|
return i, nil, nil
|
|
}
|
|
i, A, err := p.atomicOp(i)
|
|
if err != nil {
|
|
p.lastError.Chain(err)
|
|
return i, nil, nil
|
|
}
|
|
i, B, err := p.atomicOps(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
return i, NewConcat(A, B), nil
|
|
}
|
|
|
|
func (p *parser) atomicOp(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter atomicOp %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit atomicOp %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
i, A, err := p.atomic(i)
|
|
if DEBUG {
|
|
log.Printf("atomic %v", err)
|
|
}
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
i, OPS, err := p.ops(i)
|
|
if err != nil && err.Reason == "No Operator" {
|
|
return i, A, nil
|
|
} else if err != nil {
|
|
return i, A, err
|
|
}
|
|
var N = A
|
|
for _, OP := range OPS {
|
|
N = NewApplyOp(OP, N)
|
|
}
|
|
return i, N, err
|
|
}
|
|
|
|
func (p *parser) ops(i int) (int, []AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter ops %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit ops %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
ops := make([]AST, 0, 2)
|
|
var err *ParseError
|
|
var O AST
|
|
for {
|
|
i, O, err = p.op(i)
|
|
if err != nil {
|
|
if len(ops) <= 0 {
|
|
return i, nil, err
|
|
}
|
|
return i, ops, nil
|
|
}
|
|
ops = append(ops, O)
|
|
}
|
|
}
|
|
|
|
func (p *parser) op(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter op %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit op %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
i, err := p.match(i, '+')
|
|
if err == nil {
|
|
return i, NewOp("+"), nil
|
|
}
|
|
i, err = p.match(i, '*')
|
|
if err == nil {
|
|
return i, NewOp("*"), nil
|
|
}
|
|
i, err = p.match(i, '?')
|
|
if err == nil {
|
|
return i, NewOp("?"), nil
|
|
}
|
|
return i, nil, Errorf(p.text, i, "No Operator")
|
|
}
|
|
|
|
func (p *parser) atomic(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter atomic %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit atomic %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
i, ast, errChar := p.char(i)
|
|
if errChar == nil {
|
|
return i, ast, nil
|
|
}
|
|
if DEBUG {
|
|
log.Printf("char %v", errChar)
|
|
}
|
|
i, ast, errGroup := p.group(i)
|
|
if errGroup == nil {
|
|
return i, ast, nil
|
|
}
|
|
if DEBUG {
|
|
log.Printf("group %v", errGroup)
|
|
}
|
|
return i, nil, Errorf(p.text, i, "Expected group or char").Chain(errChar).Chain(errGroup)
|
|
}
|
|
|
|
func (p *parser) group(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter group %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit group %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
i, err := p.match(i, '(')
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
i, A, err := p.alternation(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
i, err = p.match(i, ')')
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
return i, A, nil
|
|
}
|
|
|
|
func (p *parser) char(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter char %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit char %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
i, C, errCHAR := p.CHAR(i)
|
|
if errCHAR == nil {
|
|
return i, C, nil
|
|
}
|
|
i, R, errRange := p.charClass(i)
|
|
if errRange == nil {
|
|
return i, R, nil
|
|
}
|
|
return i, nil, Errorf(p.text, i,
|
|
"Expected a CHAR or charRange at %d, %v", i, string(p.text)).Chain(errCHAR).Chain(errRange)
|
|
}
|
|
|
|
// The CHAR token
|
|
func (p *parser) CHAR(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter CHAR %v '%v'", i, string(p.text[i:]))
|
|
defer func() {
|
|
log.Printf("exit CHAR %v '%v'", i, string(p.text[i:]))
|
|
}()
|
|
}
|
|
if i >= len(p.text) {
|
|
return i, nil, Errorf(p.text, i, "out of input %v, %v", i, string(p.text))
|
|
}
|
|
if p.text[i] == '\\' {
|
|
i, cls, err := p.builtInClass(i)
|
|
if err == nil {
|
|
return i, cls, nil
|
|
}
|
|
i, b, err := p.getByte(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
return i, NewCharacter(b), nil
|
|
}
|
|
switch p.text[i] {
|
|
case '|', '+', '*', '?', '(', ')', '[', ']', '^':
|
|
return i, nil, Errorf(p.text, i,
|
|
"unexpected operator, %s", string([]byte{p.text[i]}))
|
|
case '.':
|
|
return i + 1, NewAny(), nil
|
|
default:
|
|
return i + 1, NewCharacter(p.text[i]), nil
|
|
}
|
|
}
|
|
|
|
var (
|
|
builtInd = canonizeRanges([]*Range{NewRange(48, 57)})
|
|
builtInD = invertRanges(builtInd)
|
|
builtIns = canonizeRanges([]*Range{
|
|
NewRange(9, 9), // \t
|
|
NewRange(10, 10), // \n
|
|
NewRange(12, 12), // \f
|
|
NewRange(13, 13), // \r
|
|
NewRange(32, 32), // ' ' (a space)
|
|
})
|
|
builtInS = invertRanges(builtIns)
|
|
builtInw = canonizeRanges([]*Range{
|
|
NewRange(48, 57), // 0-9
|
|
NewRange(65, 90), // A-Z
|
|
NewRange(97, 122), // a-z
|
|
NewRange(95, 95), // _
|
|
})
|
|
builtInW = invertRanges(builtInw)
|
|
)
|
|
|
|
func (p *parser) builtInClass(i int) (int, AST, *ParseError) {
|
|
if p.text[i] != '\\' {
|
|
return i, nil, Errorf(p.text, i, "Not the start of built-in character class %q", string([]byte{p.text[i]}))
|
|
}
|
|
if i+1 < len(p.text) {
|
|
if p.text[i+1] == 'd' {
|
|
return i + 2, rangesToAST(builtInd), nil
|
|
} else if p.text[i+1] == 'D' {
|
|
return i + 2, rangesToAST(builtInD), nil
|
|
} else if p.text[i+1] == 's' {
|
|
return i + 2, rangesToAST(builtIns), nil
|
|
} else if p.text[i+1] == 'S' {
|
|
return i + 2, rangesToAST(builtInS), nil
|
|
} else if p.text[i+1] == 'w' {
|
|
return i + 2, rangesToAST(builtInw), nil
|
|
} else if p.text[i+1] == 'W' {
|
|
return i + 2, rangesToAST(builtInW), nil
|
|
}
|
|
return i, nil, Errorf(p.text, i, "Unknown class %q", string([]byte{p.text[i+1]}))
|
|
}
|
|
return i, nil, Errorf(p.text, i, "Unexpected EOS")
|
|
}
|
|
|
|
func (p *parser) getByte(i int) (int, byte, *ParseError) {
|
|
i, err := p.match(i, '\\')
|
|
if err == nil {
|
|
if i >= len(p.text) {
|
|
return len(p.text), p.text[len(p.text)-1], nil
|
|
} else if i < len(p.text) && p.text[i] == 'n' {
|
|
return i + 1, '\n', nil
|
|
} else if i < len(p.text) && p.text[i] == 'r' {
|
|
return i + 1, '\r', nil
|
|
} else if i < len(p.text) && p.text[i] == 't' {
|
|
return i + 1, '\t', nil
|
|
}
|
|
return i + 1, p.text[i], nil
|
|
}
|
|
if i >= len(p.text) {
|
|
return i, 0, Errorf(p.text, i, "ran out of p.text at %d", i)
|
|
}
|
|
return i + 1, p.text[i], nil
|
|
}
|
|
|
|
func (p *parser) charClass(i int) (int, AST, *ParseError) {
|
|
if DEBUG {
|
|
log.Printf("enter charRange %v '%v'", i, string(p.text[i:]))
|
|
}
|
|
exclude := false
|
|
i, err := p.match(i, '[')
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
i, err = p.match(i, '^')
|
|
if err == nil {
|
|
exclude = true
|
|
}
|
|
ranges := make([]*Range, 0, 10)
|
|
for {
|
|
var r *Range
|
|
i, r, err = p.charClassItem(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
ranges = append(ranges, r)
|
|
if i < len(p.text) && p.text[i] == ']' {
|
|
break
|
|
} else if i >= len(p.text) {
|
|
break
|
|
}
|
|
}
|
|
i, err = p.match(i, ']')
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
ranges = canonizeRanges(ranges)
|
|
if exclude {
|
|
ranges = invertRanges(ranges)
|
|
}
|
|
ast := rangesToAST(ranges)
|
|
return i, ast, err
|
|
}
|
|
|
|
func (p *parser) charClassItem(i int) (int, *Range, *ParseError) {
|
|
i, S, err := p.getByte(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
i, err = p.match(i, '-')
|
|
if err != nil {
|
|
return i, NewRange(S, S), nil
|
|
}
|
|
i, T, err := p.getByte(i)
|
|
if err != nil {
|
|
return i, nil, err
|
|
}
|
|
return i, NewRange(S, T), nil
|
|
}
|
|
|
|
func canonizeRanges(from []*Range) []*Range {
|
|
sort.SliceStable(from, func(i, j int) bool {
|
|
return from[i].From < from[j].From || (from[i].From == from[j].From && from[i].To < from[j].To)
|
|
})
|
|
to := make([]*Range, 0, len(from))
|
|
var prev *Range
|
|
for _, r := range from {
|
|
if prev != nil {
|
|
if prev.To+1 >= r.From {
|
|
if prev.From >= r.From {
|
|
// no change drop r it is a subset or prev
|
|
} else {
|
|
// extend prev to the end of r
|
|
prev.To = r.To
|
|
}
|
|
} else {
|
|
// r start after prev ends add a new range
|
|
to = append(to, r)
|
|
prev = r
|
|
}
|
|
} else {
|
|
to = append(to, r)
|
|
prev = r
|
|
}
|
|
}
|
|
return to
|
|
}
|
|
|
|
// Expects p.combineOverlaps() to have been run on orig s.t. there are no
|
|
// overlapping ranges, the ranges are sorted, and all ranges are separated by at
|
|
// least one character.
|
|
func invertRanges(orig []*Range) []*Range {
|
|
if len(orig) <= 0 {
|
|
return []*Range{NewAny()}
|
|
}
|
|
invrt := make([]*Range, 0, len(orig)+1)
|
|
if orig[0].From > 0 {
|
|
invrt = append(invrt, NewRange(0, orig[0].From-1))
|
|
}
|
|
for i := 0; i+1 < len(orig); i++ {
|
|
if orig[i].To < 255 {
|
|
invrt = append(invrt, NewRange(orig[i].To+1, orig[i+1].From-1))
|
|
}
|
|
}
|
|
if orig[len(orig)-1].To < 255 {
|
|
invrt = append(invrt, NewRange(orig[len(orig)-1].To+1, 255))
|
|
}
|
|
return invrt
|
|
}
|
|
|
|
// Expects p.combineOverlaps() to have been run on orig s.t. there are no
|
|
// overlapping ranges, the ranges are sorted, and all ranges are separated by at
|
|
// least one character.
|
|
func rangesToAST(ranges []*Range) AST {
|
|
if len(ranges) == 0 {
|
|
panic("no ranges")
|
|
} else if len(ranges) == 1 {
|
|
return ranges[0]
|
|
}
|
|
ast := NewAlternation(
|
|
ranges[len(ranges)-2],
|
|
ranges[len(ranges)-1],
|
|
)
|
|
for j := len(ranges) - 3; j >= 0; j-- {
|
|
ast = NewAlternation(
|
|
ranges[j],
|
|
ast,
|
|
)
|
|
}
|
|
return ast
|
|
}
|
|
|
|
func (p *parser) matchAny(i int) (int, *Character, *ParseError) {
|
|
if i >= len(p.text) {
|
|
return i, nil, Errorf(p.text, i, "out of p.text, %d", i)
|
|
}
|
|
return i + 1, NewCharacter(p.text[i]), nil
|
|
}
|
|
|
|
func (p *parser) match(i int, c byte) (int, *ParseError) {
|
|
if i >= len(p.text) {
|
|
return i, matchErrorf(p.text, i, "out of p.text, %d", i)
|
|
} else if p.text[i] == c {
|
|
i++
|
|
return i, nil
|
|
}
|
|
return i, matchErrorf(p.text, i,
|
|
"expected '%v' at %v got '%v' of '%v'",
|
|
string([]byte{c}),
|
|
i,
|
|
string(p.text[i:i+1]),
|
|
string(p.text[i:]),
|
|
)
|
|
}
|