From d2362be35b2744b5f6a8b1029d2a64a85c903184 Mon Sep 17 00:00:00 2001
From: zhouzhihong
Date: Thu, 25 Aug 2022 21:49:42 +0800
Subject: [PATCH 1/2] Update token string func.

---
 lexer.go | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lexer.go b/lexer.go
index 939b23b..b32f844 100644
--- a/lexer.go
+++ b/lexer.go
@@ -55,7 +55,20 @@ func (t *Token) Equals(other *Token) bool {
 
 // String formats the token in a human readable form.
 func (t *Token) String() string {
-	return fmt.Sprintf("%d %q %d (%d, %d)-(%d, %d)", t.Type, t.Value, t.TC, t.StartLine, t.StartColumn, t.EndLine, t.EndColumn)
+	return fmt.Sprintf(
+		"%d %q %d (%d, %d)-(%d, %d) (%d, %d)-(%d, %d)",
+		t.Type,
+		t.Value,
+		t.TC,
+		t.StartLine,
+		t.StartColumn,
+		t.EndLine,
+		t.EndColumn,
+		t.TSLine,
+		t.TSColumn,
+		t.TELine,
+		t.TEColumn,
+	)
 }
 
 // An Action is a function which get called when the Scanner finds a match

From 7d2133ac251ac0f864f92602472407f343c09852 Mon Sep 17 00:00:00 2001
From: zhouzhihong
Date: Fri, 26 Aug 2022 14:48:17 +0800
Subject: [PATCH 2/2] Update doc

---
 README.md | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 doc.go    | 115 -------------------------------------
 2 files changed, 166 insertions(+), 115 deletions(-)
 create mode 100644 README.md

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a16d966
--- /dev/null
+++ b/README.md
@@ -0,0 +1,166 @@
+## Lexmachine
+A lexical analyzer adapted from an open-source project; the main change is that token line and column numbers are also computed for UTF-8 text.
+
+### Quick start
+
+#### Installation
+```
+go get gitea.xintech.co/zhouzhihong/lexmachine
+```
+
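+#### Token positions
+
+A minimal sketch of reading the extra position fields this fork exposes. It
+assumes `TSLine`/`TSColumn`/`TELine`/`TEColumn` are the rune-based (UTF-8
+aware) counterparts of the byte-based `StartLine`/`StartColumn`/`EndLine`/`EndColumn`
+fields, and that token type ids are plain integers chosen by the caller; see
+the full example below for a complete lexer definition.
+```
+package main
+
+import (
+	"fmt"
+
+	lex "gitea.xintech.co/zhouzhihong/lexmachine"
+	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
+)
+
+func main() {
+	lexer := lex.NewLexer()
+	// Token type 0: C-style block comments (same pattern as the example below).
+	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`),
+		func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
+			return s.Token(0, string(m.Bytes), m), nil
+		})
+	// Skip whitespace between tokens by returning a nil token.
+	lexer.Add([]byte("( |\t|\n|\r)+"),
+		func(*lex.Scanner, *machines.Match) (interface{}, error) {
+			return nil, nil
+		})
+	if err := lexer.Compile(); err != nil {
+		panic(err)
+	}
+
+	// "你好" is 2 runes but 6 bytes, so the byte-based and rune-based columns
+	// of the second comment diverge.
+	s, err := lexer.Scanner([]byte("/*你好*/ /*world*/"))
+	if err != nil {
+		panic(err)
+	}
+	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
+		if err != nil {
+			panic(err)
+		}
+		t := tok.(*lex.Token)
+		fmt.Printf("%q bytes (%d, %d)-(%d, %d) runes (%d, %d)-(%d, %d)\n",
+			t.Value,
+			t.StartLine, t.StartColumn, t.EndLine, t.EndColumn,
+			t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
+	}
+}
+```
+
+If the fields behave as assumed, the byte-based columns advance by three for
+each Chinese character (three bytes in UTF-8) while the rune-based columns
+advance by one.
+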
+#### Usage example
+```
+package main
+
+import (
+	"fmt"
+	"strings"
+
+	lex "gitea.xintech.co/zhouzhihong/lexmachine"
+	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
+)
+
+var Literals []string       // The tokens representing literal strings
+var Keywords []string       // The keyword tokens
+var Tokens []string         // All of the tokens (including literals and keywords)
+var TokenIds map[string]int // A map from the token names to their int ids
+var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner
+
+// Called at package initialization. Creates the lexer and populates token lists.
+func init() {
+	initTokens()
+	var err error
+	Lexer, err = initLexer()
+	if err != nil {
+		panic(err)
+	}
+}
+
+func initTokens() {
+	Literals = []string{
+		"[",
+		"]",
+		"{",
+		"}",
+		"=",
+		",",
+		";",
+		":",
+		"->",
+		"--",
+	}
+	Keywords = []string{
+		"NODE",
+		"EDGE",
+		"GRAPH",
+		"DIGRAPH",
+		"SUBGRAPH",
+		"STRICT",
+	}
+	Tokens = []string{
+		"COMMENT",
+		"ID",
+	}
+	Tokens = append(Tokens, Keywords...)
+	Tokens = append(Tokens, Literals...)
+	TokenIds = make(map[string]int)
+	for i, tok := range Tokens {
+		TokenIds[tok] = i
+	}
+}
+
+// Creates the lexer object and compiles the NFA.
+func initLexer() (*lex.Lexer, error) {
+	lexer := lex.NewLexer()
+
+	for _, lit := range Literals {
+		r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
+		lexer.Add([]byte(r), token(lit))
+	}
+	for _, name := range Keywords {
+		lexer.Add([]byte(strings.ToLower(name)), token(name))
+	}
+
+	lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
+	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
+	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
+	lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
+	lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
+		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
+			x, _ := token("ID")(scan, match)
+			t := x.(*lex.Token)
+			v := t.Value.(string)
+			t.Value = v[1 : len(v)-1]
+			return t, nil
+		})
+	lexer.Add([]byte("( |\t|\n|\r)+"), skip)
+	lexer.Add([]byte(`\<`),
+		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
+			str := make([]byte, 0, 10)
+			str = append(str, match.Bytes...)
+			brackets := 1
+			match.EndLine = match.StartLine
+			match.EndColumn = match.StartColumn
+			for tc := scan.TC; tc < len(scan.Text); tc++ {
+				str = append(str, scan.Text[tc])
+				match.EndColumn += 1
+				if scan.Text[tc] == '\n' {
+					match.EndLine += 1
+				}
+				if scan.Text[tc] == '<' {
+					brackets += 1
+				} else if scan.Text[tc] == '>' {
+					brackets -= 1
+				}
+				if brackets == 0 {
+					match.TC = scan.TC
+					scan.TC = tc + 1
+					match.Bytes = str
+					x, _ := token("ID")(scan, match)
+					t := x.(*lex.Token)
+					v := t.Value.(string)
+					t.Value = v[1 : len(v)-1]
+					return t, nil
+				}
+			}
+			return nil,
+				fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
+					match.TC, match.StartLine, match.StartColumn)
+		},
+	)
+
+	err := lexer.Compile()
+	if err != nil {
+		return nil, err
+	}
+	return lexer, nil
+}
+
+// a lex.Action function which skips the match.
+func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
+	return nil, nil
+}
+
+// a lex.Action function which constructs a Token of the given token type by
+// the token type's name.
+func token(name string) lex.Action {
+	return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
+		return s.Token(TokenIds[name], string(m.Bytes), m), nil
+	}
+}
+
+// main scans some example input and prints each token together with its
+// rune-based start and end positions.
+func main() {
+	s, err := Lexer.Scanner([]byte(`
+    digraph G { a -> b; } <>asdf>x "asdf\\\\\"" // asdf
+    strict // asdfa asfwe
+/*你好*/ DIGRAPH // asdf`))
+	if err != nil {
+		panic(err)
+	}
+	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
+		if ui, is := err.(*machines.UnconsumedInput); is {
+			// Report unmatched input and skip past it.
+			fmt.Println(ui)
+			s.TC = ui.FailTC
+			continue
+		} else if err != nil {
+			panic(err)
+		}
+		t := tok.(*lex.Token)
+		fmt.Println(t)
+		fmt.Printf("(%v,%v)-(%v,%v)\n", t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
+	}
+}
+```
+
+// TODO: optimize the position marker calculation
\ No newline at end of file
diff --git a/doc.go b/doc.go
index efe744d..753de52 100644
--- a/doc.go
+++ b/doc.go
@@ -1,116 +1 @@
-// Package lexmachine is a full lexical analysis framework for the Go
-// programming language. It supports a restricted but usable set of regular
-// expressions appropriate for writing lexers for complex programming
-// languages. The framework also supports sub-lexers and non-regular lexing
-// through an "escape hatch" which allows the users to consume any number of
-// further bytes after a match. So if you want to support nested C-style
-// comments or other paired structures you can do so at the lexical analysis
-// stage.
-//
-// For a tutorial see
-// http://hackthology.com/writing-a-lexer-in-go-with-lexmachine.html
-//
-// Example of defining a lexer
-//
-//	// CreateLexer defines a lexer for the graphviz dot language.
-//	func CreateLexer() (*lexmachine.Lexer, error) {
-//		lexer := lexmachine.NewLexer()
-//
-//		for _, lit := range Literals {
-//			r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
-//			lexer.Add([]byte(r), token(lit))
-//		}
-//		for _, name := range Keywords {
-//			lexer.Add([]byte(strings.ToLower(name)), token(name))
-//		}
-//
-//		lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
-//		lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
-//		lexer.Add([]byte(`([a-z]|[A-Z])([a-z]|[A-Z]|[0-9]|_)*`), token("ID"))
-//		lexer.Add([]byte(`"([^\\"]|(\\.))*"`), token("ID"))
-//		lexer.Add([]byte("( |\t|\n|\r)+"), skip)
-//		lexer.Add([]byte(`\<`),
-//			func(scan *lexmachine.Scanner, match *machines.Match) (interface{}, error) {
-//				str := make([]byte, 0, 10)
-//				str = append(str, match.Bytes...)
-//				brackets := 1
-//				match.EndLine = match.StartLine
-//				match.EndColumn = match.StartColumn
-//				for tc := scan.TC; tc < len(scan.Text); tc++ {
-//					str = append(str, scan.Text[tc])
-//					match.EndColumn += 1
-//					if scan.Text[tc] == '\n' {
-//						match.EndLine += 1
-//					}
-//					if scan.Text[tc] == '<' {
-//						brackets += 1
-//					} else if scan.Text[tc] == '>' {
-//						brackets -= 1
-//					}
-//					if brackets == 0 {
-//						match.TC = scan.TC
-//						scan.TC = tc + 1
-//						match.Bytes = str
-//						return token("ID")(scan, match)
-//					}
-//				}
-//				return nil,
-//					fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
-//						match.TC, match.StartLine, match.StartColumn)
-//			},
-//		)
-//
-//		err := lexer.Compile()
-//		if err != nil {
-//			return nil, err
-//		}
-//		return lexer, nil
-//	}
-//
-//	func token(name string) lex.Action {
-//		return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
-//			return s.Token(TokenIds[name], string(m.Bytes), m), nil
-//		}
-//	}
-//
-// Example of using a lexer
-//
-//	func ExampleLex() error {
-//		lexer, err := CreateLexer()
-//		if err != nil {
-//			return err
-//		}
-//		scanner, err := lexer.Scanner([]byte(`digraph {
-//			rankdir=LR;
-//			a [label="a" shape=box];
-//			c [