lexmachine/frontend/parser.go
zhouzhihong 879a86b6b7 新特性 (#1)
1. 增加utf8文本的行数和列数计算字段

Co-authored-by: zhouzhihong <zhouzhihong@ijunhai.com>
Reviewed-on: https://gitea.xintech.co/zhouzhihong/lexmachine/pulls/1
2022-08-25 14:41:32 +08:00

572 lines
13 KiB
Go

// Package frontend parses regular expressions and compiles them into NFA
// bytecode
package frontend
import (
"fmt"
"log"
"runtime"
"sort"
"strings"
)
// Turn on debug prints
var DEBUG = false
// ParseError gives structured errors for parsing problems.
type ParseError struct {
Reason string
Production string
TC int
text []byte
chain []*ParseError
}
// Errorf constructs a parse error with format for a particular location.
func Errorf(text []byte, tc int, format string, args ...interface{}) *ParseError {
pc, _, _, ok := runtime.Caller(1)
return errorf(pc, ok, text, tc, format, args...)
}
func matchErrorf(text []byte, tc int, format string, args ...interface{}) *ParseError {
pc, _, _, ok := runtime.Caller(2)
return errorf(pc, ok, text, tc, format, args...)
}
func errorf(pc uintptr, ok bool, text []byte, tc int, format string, args ...interface{}) *ParseError {
var fn = "unknown"
if ok {
fn = runtime.FuncForPC(pc).Name()
split := strings.Split(fn, ".")
fn = split[len(split)-1]
}
msg := fmt.Sprintf(format, args...)
return &ParseError{
Reason: msg,
Production: fn,
TC: tc,
text: text,
}
}
// Error implements the error interface
func (p *ParseError) Error() string {
errs := make([]string, 0, len(p.chain)+1)
for i := len(p.chain) - 1; i >= 0; i-- {
errs = append(errs, p.chain[i].Error())
}
errs = append(errs, p.error())
return strings.Join(errs, "\n")
}
func (p *ParseError) error() string {
line, col := LineCol(p.text, p.TC)
return fmt.Sprintf("Regex parse error in production '%v' : at index %v line %v column %v '%s' : %v",
p.Production, p.TC, line, col, p.text[p.TC:], p.Reason)
}
// String formats the error for humans
func (p *ParseError) String() string {
return p.Error()
}
// Chain joins multiple ParseErrors together
func (p *ParseError) Chain(e *ParseError) *ParseError {
p.chain = append(p.chain, e)
return p
}
// LineCol computes the line and column of a particular index inside of a byte
// slice.
func LineCol(text []byte, tc int) (line int, col int) {
for i := 0; i <= tc && i < len(text); i++ {
if text[i] == '\n' {
col = 0
line++
} else {
col++
}
}
if tc == 0 && tc < len(text) {
if text[tc] == '\n' {
line++
col--
}
}
return line, col
}
// Parse a regular expression into an Abstract Syntax Tree (AST)
func Parse(text []byte) (AST, error) {
a, err := (&parser{
text: text,
lastError: Errorf(text, 0, "unconsumed input"),
}).regex()
if err != nil {
return nil, err
}
return a, nil
}
type parser struct {
text []byte
lastError *ParseError
}
func (p *parser) regex() (AST, *ParseError) {
i, ast, err := p.alternation(0)
if err != nil {
return nil, err
} else if i != len(p.text) {
return nil, p.lastError
}
return NewMatch(ast), nil
}
func (p *parser) alternation(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter alternation %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit alternation %v '%v'", i, string(p.text[i:]))
}()
}
i, A, err := p.atomicOps(i)
if err != nil {
return i, nil, err
}
i, B, err := p.alternation_(i)
if err != nil {
return i, nil, err
}
return i, NewAlternation(A, B), nil
}
func (p *parser) alternation_(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter alternation_ %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit alternation_ %v '%v'", i, string(p.text[i:]))
}()
}
if i >= len(p.text) {
return i, nil, nil
}
i, err := p.match(i, '|')
if err != nil {
return i, nil, nil
}
i, A, err := p.atomicOps(i)
if err != nil {
return i, nil, err
}
i, B, err := p.alternation_(i)
if err != nil {
return i, nil, err
}
return i, NewAlternation(A, B), nil
}
func (p *parser) atomicOps(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter atomicOps %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit atomicOps %v '%v'", i, string(p.text[i:]))
}()
}
if i >= len(p.text) {
return i, nil, nil
}
i, A, err := p.atomicOp(i)
if err != nil {
p.lastError.Chain(err)
return i, nil, nil
}
i, B, err := p.atomicOps(i)
if err != nil {
return i, nil, err
}
return i, NewConcat(A, B), nil
}
func (p *parser) atomicOp(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter atomicOp %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit atomicOp %v '%v'", i, string(p.text[i:]))
}()
}
i, A, err := p.atomic(i)
if DEBUG {
log.Printf("atomic %v", err)
}
if err != nil {
return i, nil, err
}
i, OPS, err := p.ops(i)
if err != nil && err.Reason == "No Operator" {
return i, A, nil
} else if err != nil {
return i, A, err
}
var N = A
for _, OP := range OPS {
N = NewApplyOp(OP, N)
}
return i, N, err
}
func (p *parser) ops(i int) (int, []AST, *ParseError) {
if DEBUG {
log.Printf("enter ops %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit ops %v '%v'", i, string(p.text[i:]))
}()
}
ops := make([]AST, 0, 2)
var err *ParseError
var O AST
for {
i, O, err = p.op(i)
if err != nil {
if len(ops) <= 0 {
return i, nil, err
}
return i, ops, nil
}
ops = append(ops, O)
}
}
func (p *parser) op(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter op %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit op %v '%v'", i, string(p.text[i:]))
}()
}
i, err := p.match(i, '+')
if err == nil {
return i, NewOp("+"), nil
}
i, err = p.match(i, '*')
if err == nil {
return i, NewOp("*"), nil
}
i, err = p.match(i, '?')
if err == nil {
return i, NewOp("?"), nil
}
return i, nil, Errorf(p.text, i, "No Operator")
}
func (p *parser) atomic(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter atomic %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit atomic %v '%v'", i, string(p.text[i:]))
}()
}
i, ast, errChar := p.char(i)
if errChar == nil {
return i, ast, nil
}
if DEBUG {
log.Printf("char %v", errChar)
}
i, ast, errGroup := p.group(i)
if errGroup == nil {
return i, ast, nil
}
if DEBUG {
log.Printf("group %v", errGroup)
}
return i, nil, Errorf(p.text, i, "Expected group or char").Chain(errChar).Chain(errGroup)
}
func (p *parser) group(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter group %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit group %v '%v'", i, string(p.text[i:]))
}()
}
i, err := p.match(i, '(')
if err != nil {
return i, nil, err
}
i, A, err := p.alternation(i)
if err != nil {
return i, nil, err
}
i, err = p.match(i, ')')
if err != nil {
return i, nil, err
}
return i, A, nil
}
func (p *parser) char(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter char %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit char %v '%v'", i, string(p.text[i:]))
}()
}
i, C, errCHAR := p.CHAR(i)
if errCHAR == nil {
return i, C, nil
}
i, R, errRange := p.charClass(i)
if errRange == nil {
return i, R, nil
}
return i, nil, Errorf(p.text, i,
"Expected a CHAR or charRange at %d, %v", i, string(p.text)).Chain(errCHAR).Chain(errRange)
}
// The CHAR token
func (p *parser) CHAR(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter CHAR %v '%v'", i, string(p.text[i:]))
defer func() {
log.Printf("exit CHAR %v '%v'", i, string(p.text[i:]))
}()
}
if i >= len(p.text) {
return i, nil, Errorf(p.text, i, "out of input %v, %v", i, string(p.text))
}
if p.text[i] == '\\' {
i, cls, err := p.builtInClass(i)
if err == nil {
return i, cls, nil
}
i, b, err := p.getByte(i)
if err != nil {
return i, nil, err
}
return i, NewCharacter(b), nil
}
switch p.text[i] {
case '|', '+', '*', '?', '(', ')', '[', ']', '^':
return i, nil, Errorf(p.text, i,
"unexpected operator, %s", string([]byte{p.text[i]}))
case '.':
return i + 1, NewAny(), nil
default:
return i + 1, NewCharacter(p.text[i]), nil
}
}
var (
builtInd = canonizeRanges([]*Range{NewRange(48, 57)})
builtInD = invertRanges(builtInd)
builtIns = canonizeRanges([]*Range{
NewRange(9, 9), // \t
NewRange(10, 10), // \n
NewRange(12, 12), // \f
NewRange(13, 13), // \r
NewRange(32, 32), // ' ' (a space)
})
builtInS = invertRanges(builtIns)
builtInw = canonizeRanges([]*Range{
NewRange(48, 57), // 0-9
NewRange(65, 90), // A-Z
NewRange(97, 122), // a-z
NewRange(95, 95), // _
})
builtInW = invertRanges(builtInw)
)
func (p *parser) builtInClass(i int) (int, AST, *ParseError) {
if p.text[i] != '\\' {
return i, nil, Errorf(p.text, i, "Not the start of built-in character class %q", string([]byte{p.text[i]}))
}
if i+1 < len(p.text) {
if p.text[i+1] == 'd' {
return i + 2, rangesToAST(builtInd), nil
} else if p.text[i+1] == 'D' {
return i + 2, rangesToAST(builtInD), nil
} else if p.text[i+1] == 's' {
return i + 2, rangesToAST(builtIns), nil
} else if p.text[i+1] == 'S' {
return i + 2, rangesToAST(builtInS), nil
} else if p.text[i+1] == 'w' {
return i + 2, rangesToAST(builtInw), nil
} else if p.text[i+1] == 'W' {
return i + 2, rangesToAST(builtInW), nil
}
return i, nil, Errorf(p.text, i, "Unknown class %q", string([]byte{p.text[i+1]}))
}
return i, nil, Errorf(p.text, i, "Unexpected EOS")
}
func (p *parser) getByte(i int) (int, byte, *ParseError) {
i, err := p.match(i, '\\')
if err == nil {
if i >= len(p.text) {
return len(p.text), p.text[len(p.text)-1], nil
} else if i < len(p.text) && p.text[i] == 'n' {
return i + 1, '\n', nil
} else if i < len(p.text) && p.text[i] == 'r' {
return i + 1, '\r', nil
} else if i < len(p.text) && p.text[i] == 't' {
return i + 1, '\t', nil
}
return i + 1, p.text[i], nil
}
if i >= len(p.text) {
return i, 0, Errorf(p.text, i, "ran out of p.text at %d", i)
}
return i + 1, p.text[i], nil
}
func (p *parser) charClass(i int) (int, AST, *ParseError) {
if DEBUG {
log.Printf("enter charRange %v '%v'", i, string(p.text[i:]))
}
exclude := false
i, err := p.match(i, '[')
if err != nil {
return i, nil, err
}
i, err = p.match(i, '^')
if err == nil {
exclude = true
}
ranges := make([]*Range, 0, 10)
for {
var r *Range
i, r, err = p.charClassItem(i)
if err != nil {
return i, nil, err
}
ranges = append(ranges, r)
if i < len(p.text) && p.text[i] == ']' {
break
} else if i >= len(p.text) {
break
}
}
i, err = p.match(i, ']')
if err != nil {
return i, nil, err
}
ranges = canonizeRanges(ranges)
if exclude {
ranges = invertRanges(ranges)
}
ast := rangesToAST(ranges)
return i, ast, err
}
func (p *parser) charClassItem(i int) (int, *Range, *ParseError) {
i, S, err := p.getByte(i)
if err != nil {
return i, nil, err
}
i, err = p.match(i, '-')
if err != nil {
return i, NewRange(S, S), nil
}
i, T, err := p.getByte(i)
if err != nil {
return i, nil, err
}
return i, NewRange(S, T), nil
}
func canonizeRanges(from []*Range) []*Range {
sort.SliceStable(from, func(i, j int) bool {
return from[i].From < from[j].From || (from[i].From == from[j].From && from[i].To < from[j].To)
})
to := make([]*Range, 0, len(from))
var prev *Range
for _, r := range from {
if prev != nil {
if prev.To+1 >= r.From {
if prev.From >= r.From {
// no change drop r it is a subset or prev
} else {
// extend prev to the end of r
prev.To = r.To
}
} else {
// r start after prev ends add a new range
to = append(to, r)
prev = r
}
} else {
to = append(to, r)
prev = r
}
}
return to
}
// Expects p.combineOverlaps() to have been run on orig s.t. there are no
// overlapping ranges, the ranges are sorted, and all ranges are separated by at
// least one character.
func invertRanges(orig []*Range) []*Range {
if len(orig) <= 0 {
return []*Range{NewAny()}
}
invrt := make([]*Range, 0, len(orig)+1)
if orig[0].From > 0 {
invrt = append(invrt, NewRange(0, orig[0].From-1))
}
for i := 0; i+1 < len(orig); i++ {
if orig[i].To < 255 {
invrt = append(invrt, NewRange(orig[i].To+1, orig[i+1].From-1))
}
}
if orig[len(orig)-1].To < 255 {
invrt = append(invrt, NewRange(orig[len(orig)-1].To+1, 255))
}
return invrt
}
// Expects p.combineOverlaps() to have been run on orig s.t. there are no
// overlapping ranges, the ranges are sorted, and all ranges are separated by at
// least one character.
func rangesToAST(ranges []*Range) AST {
if len(ranges) == 0 {
panic("no ranges")
} else if len(ranges) == 1 {
return ranges[0]
}
ast := NewAlternation(
ranges[len(ranges)-2],
ranges[len(ranges)-1],
)
for j := len(ranges) - 3; j >= 0; j-- {
ast = NewAlternation(
ranges[j],
ast,
)
}
return ast
}
func (p *parser) matchAny(i int) (int, *Character, *ParseError) {
if i >= len(p.text) {
return i, nil, Errorf(p.text, i, "out of p.text, %d", i)
}
return i + 1, NewCharacter(p.text[i]), nil
}
func (p *parser) match(i int, c byte) (int, *ParseError) {
if i >= len(p.text) {
return i, matchErrorf(p.text, i, "out of p.text, %d", i)
} else if p.text[i] == c {
i++
return i, nil
}
return i, matchErrorf(p.text, i,
"expected '%v' at %v got '%v' of '%v'",
string([]byte{c}),
i,
string(p.text[i:i+1]),
string(p.text[i:]),
)
}