# Lexmachine

This lexer is adapted from an open-source project; the main change is the calculation of line and column numbers for UTF-8 text.
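As an illustration of what rune-aware position tracking means, the sketch below counts columns per decoded rune rather than per byte, so a multi-byte UTF-8 character advances the column by exactly one. This is only a standalone example: the `lineCol` function is invented here and is not part of this package's API.

```go
package main

import (
	"fmt"
	"unicode/utf8"
)

// lineCol returns the 1-based line and column of the rune that starts at
// byte offset tc in text. Columns count runes, not bytes, so a multi-byte
// UTF-8 character such as '你' advances the column by exactly one.
func lineCol(text []byte, tc int) (line, col int) {
	line, col = 1, 0
	for i := 0; i <= tc && i < len(text); {
		r, size := utf8.DecodeRune(text[i:])
		if r == '\n' {
			line++
			col = 0
		} else {
			col++
		}
		i += size
	}
	return line, col
}

func main() {
	text := []byte("ab\n你好c")
	fmt.Println(lineCol(text, 0)) // 1 1 ('a')
	fmt.Println(lineCol(text, 3)) // 2 1 ('你' starts at byte 3)
	fmt.Println(lineCol(text, 9)) // 2 3 ('c' starts at byte 9)
}
```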

## Quick start

### Install

```
go get gitea.xintech.co/zhouzhihong/lexmachine
```

### Usage example

```go
import (
	"fmt"
	"strings"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

var Literals []string       // The tokens representing literal strings
var Keywords []string       // The keyword tokens
var Tokens []string         // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner

// Called at package initialization. Creates the lexer and populates token lists.
func init() {
	initTokens()
	var err error
	Lexer, err = initLexer()
	if err != nil {
		panic(err)
	}
}

func initTokens() {
	Literals = []string{
		"[",
		"]",
		"{",
		"}",
		"=",
		",",
		";",
		":",
		"->",
		"--",
	}
	Keywords = []string{
		"NODE",
		"EDGE",
		"GRAPH",
		"DIGRAPH",
		"SUBGRAPH",
		"STRICT",
	}
	Tokens = []string{
		"COMMENT",
		"ID",
	}
	Tokens = append(Tokens, Keywords...)
	Tokens = append(Tokens, Literals...)
	TokenIds = make(map[string]int)
	for i, tok := range Tokens {
		TokenIds[tok] = i
	}
}

// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
	lexer := lex.NewLexer()

	for _, lit := range Literals {
		r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
		lexer.Add([]byte(r), token(lit))
	}
	for _, name := range Keywords {
		lexer.Add([]byte(strings.ToLower(name)), token(name))
	}

	lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
	lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
	lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			x, _ := token("ID")(scan, match)
			t := x.(*lex.Token)
			v := t.Value.(string)
			t.Value = v[1 : len(v)-1]
			return t, nil
		})
	lexer.Add([]byte("( |\t|\n|\r)+"), skip)
	// HTML-style literal delimited by '<' and '>': scan forward manually,
	// tracking nested angle brackets and updating the end line/column.
	lexer.Add([]byte(`\<`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			str := make([]byte, 0, 10)
			str = append(str, match.Bytes...)
			brackets := 1
			match.EndLine = match.StartLine
			match.EndColumn = match.StartColumn
			for tc := scan.TC; tc < len(scan.Text); tc++ {
				str = append(str, scan.Text[tc])
				match.EndColumn += 1
				if scan.Text[tc] == '\n' {
					match.EndLine += 1
				}
				if scan.Text[tc] == '<' {
					brackets += 1
				} else if scan.Text[tc] == '>' {
					brackets -= 1
				}
				if brackets == 0 {
					match.TC = scan.TC
					scan.TC = tc + 1
					match.Bytes = str
					x, _ := token("ID")(scan, match)
					t := x.(*lex.Token)
					v := t.Value.(string)
					t.Value = v[1 : len(v)-1]
					return t, nil
				}
			}
			return nil,
				fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
					match.TC, match.StartLine, match.StartColumn)
		},
	)

	err := lexer.Compile()
	if err != nil {
		return nil, err
	}
	return lexer, nil
}

// a lex.Action function which skips the match.
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
	return nil, nil
}

// a lex.Action function which constructs a Token of the given token type
// using the token type's name.
func token(name string) lex.Action {
	return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
		return s.Token(TokenIds[name], string(m.Bytes), m), nil
	}
}
```

Scan some input and print each token together with its start and end line/column:

```go
s, err := Lexer.Scanner([]byte(`
	digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
	strict // asdfa asfwe
/*你好*/ DIGRAPH // asdf`))
if err != nil {
	panic(err)
}

for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
	if err != nil {
		panic(err)
	}
	fmt.Println(tok)
	fmt.Printf("(%v,%v)-(%v,%v)\n",
		tok.(*lex.Token).TSLine,
		tok.(*lex.Token).TSColumn,
		tok.(*lex.Token).TELine,
		tok.(*lex.Token).TEColumn,
	)
}
```
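If the scanner hits input that no pattern matches, `Next` returns an error instead of a token. Assuming this fork keeps the upstream lexmachine behavior, that error is a `*machines.UnconsumedInput`, and scanning can resume past the failure point by advancing the scanner's text counter; a minimal sketch:

```go
for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
	if ui, is := err.(*machines.UnconsumedInput); is {
		// Skip the unmatched bytes and keep scanning from the failure point.
		s.TC = ui.FailTC
		continue
	} else if err != nil {
		panic(err)
	}
	fmt.Println(tok.(*lex.Token))
}
```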

TODO: optimize the token position (line/column) calculation.