Merge remote-tracking branch 'origin/dev'

2022-08-28 22:51:25 +08:00 · 2022-08-28 22:51:25 +08:00 · 62b949dd83
commit 62b949dd83
parent ef4073251a 95e26e7aae
3 changed files with 180 additions and 116 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,166 @@
 ## Lexmachine
 词法分析器，基于开源项目改造实现，主要增加了对 UTF-8 文本行号和列号的计算
 ### 快速开始
 #### 安装
 ```
 go get gitea.xintech.co/zhouzhihong/lexmachine
 ```
 #### 使用例子
 ```
 import (
 	"fmt"
 	"strings"
 	lex "gitea.xintech.co/zhouzhihong/lexmachine"
 	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
 )
 var Literals []string       // The tokens representing literal strings
 var Keywords []string       // The keyword tokens
 var Tokens []string         // All of the tokens (including literals and keywords)
 var TokenIds map[string]int // A map from the token names to their int ids
 var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner
 // init runs at package initialization. It fills the token tables and then
 // builds the package-level Lexer, panicking if the lexer cannot be built.
 func init() {
 	initTokens()
 	built, err := initLexer()
 	if err != nil {
 		panic(err)
 	}
 	Lexer = built
 }
 // initTokens fills the package-level token tables: Literals, Keywords, the
 // combined Tokens list, and TokenIds, which maps each token name to its
 // index in Tokens.
 func initTokens() {
 	Literals = []string{"[", "]", "{", "}", "=", ",", ";", ":", "->", "--"}
 	Keywords = []string{"NODE", "EDGE", "GRAPH", "DIGRAPH", "SUBGRAPH", "STRICT"}
 	// COMMENT and ID come first, followed by the keywords and then the literals.
 	Tokens = make([]string, 0, 2+len(Keywords)+len(Literals))
 	Tokens = append(Tokens, "COMMENT", "ID")
 	Tokens = append(Tokens, Keywords...)
 	Tokens = append(Tokens, Literals...)
 	TokenIds = make(map[string]int, len(Tokens))
 	for id := range Tokens {
 		TokenIds[Tokens[id]] = id
 	}
 }
 // initLexer registers every token pattern on a fresh lexer and compiles it.
 // It returns the compiled lexer, or a nil lexer and the error from Compile.
 func initLexer() (*lex.Lexer, error) {
 	lx := lex.NewLexer()
 	// unwrapID emits the match as an ID token and then drops the first and
 	// last byte of its string value (the surrounding quotes or brackets).
 	unwrapID := func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
 		raw, _ := token("ID")(scan, match) // token actions always return a nil error
 		tok := raw.(*lex.Token)
 		text := tok.Value.(string)
 		tok.Value = text[1 : len(text)-1]
 		return tok, nil
 	}
 	// Literals: put a backslash in front of every character of the literal.
 	for _, lit := range Literals {
 		pattern := "\\" + strings.Join(strings.Split(lit, ""), "\\")
 		lx.Add([]byte(pattern), token(lit))
 	}
 	// Keywords are registered using their lower-case spelling.
 	for _, kw := range Keywords {
 		lx.Add([]byte(strings.ToLower(kw)), token(kw))
 	}
 	// Line comments and block comments.
 	lx.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
 	lx.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
 	// Identifiers, decimal numbers and double-quoted strings all become ID.
 	lx.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
 	lx.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
 	lx.Add([]byte(`"([^\\"]|(\\.))*"`), unwrapID)
 	// Whitespace produces no token.
 	lx.Add([]byte("( |\t|\n|\r)+"), skip)
 	// An opening '<' starts a bracketed literal. The action consumes input
 	// by hand until the nesting depth of '<' and '>' returns to zero.
 	lx.Add([]byte(`\<`),
 		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
 			body := append(make([]byte, 0, 10), match.Bytes...)
 			depth := 1
 			match.EndLine, match.EndColumn = match.StartLine, match.StartColumn
 			for pos := scan.TC; pos < len(scan.Text); pos++ {
 				ch := scan.Text[pos]
 				body = append(body, ch)
 				// EndColumn advances once per byte and is not reset on a newline.
 				match.EndColumn++
 				switch ch {
 				case '\n':
 					match.EndLine++
 				case '<':
 					depth++
 				case '>':
 					depth--
 				}
 				if depth != 0 {
 					continue
 				}
 				// Balanced: record where the match began, move the scanner
 				// past the closing '>', and emit the collected bytes.
 				match.TC = scan.TC
 				scan.TC = pos + 1
 				match.Bytes = body
 				return unwrapID(scan, match)
 			}
 			return nil,
 				fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
 					match.TC, match.StartLine, match.StartColumn)
 		},
 	)
 	if err := lx.Compile(); err != nil {
 		return nil, err
 	}
 	return lx, nil
 }
 // skip is a lex.Action that discards the match: it returns neither a token
 // nor an error.
 func skip(_ *lex.Scanner, _ *machines.Match) (interface{}, error) {
 	var none interface{}
 	return none, nil
 }
 // token returns a lex.Action that builds a Token for the token type called
 // name. The type id is looked up in TokenIds each time the action runs, and
 // the token value is the matched bytes converted to a string.
 func token(name string) lex.Action {
 	return func(scanner *lex.Scanner, match *machines.Match) (interface{}, error) {
 		id := TokenIds[name]
 		lexeme := string(match.Bytes)
 		return scanner.Token(id, lexeme, match), nil
 	}
 }
 // Build a scanner over a sample input. The error returned by Scanner is
 // discarded here.
 s, _ := dot.Lexer.Scanner([]byte(`
 	digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
 	strict // asdfa asfwe
 /*你好*/ DIGRAPH // asdf`))
 // Call Next repeatedly until its third result (eof) is true, printing each
 // token and error followed by the token's TS/TE line and column fields.
 for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
    fmt.Println(tok, err)
    // NOTE(review): the single-value type assertions below panic if tok does
    // not hold a *lexmachine.Token. Presumably tok is nil when err is non-nil;
    // confirm, and check err first if so.
    fmt.Printf("(%v,%v)-(%v,%v)\n",
        tok.(*lexmachine.Token).TSLine,
        tok.(*lexmachine.Token).TSColumn,
        tok.(*lexmachine.Token).TELine,
        tok.(*lexmachine.Token).TEColumn,
    )
 }
 ```
 // TODO 优化位置标记计算
--- a/doc.go
+++ b/doc.go
@ -1,116 +1 @@
 // Package lexmachine is a full lexical analysis framework for the Go
 // programming language. It supports a restricted but usable set of regular
 // expressions appropriate for writing lexers for complex programming
 // languages. The framework also supports sub-lexers and non-regular lexing
 // through an "escape hatch" which allows the users to consume any number of
 // further bytes after a match. So if you want to support nested C-style
 // comments or other paired structures you can do so at the lexical analysis
 // stage.
 //
 // For a tutorial see
 // http://hackthology.com/writing-a-lexer-in-go-with-lexmachine.html
 //
 // Example of defining a lexer
 //
 //     // CreateLexer defines a lexer for the graphviz dot language.
 //     func CreateLexer() (*lexmachine.Lexer, error) {
 //         lexer := lexmachine.NewLexer()
 //
 //         for _, lit := range Literals {
 //             r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
 //             lexer.Add([]byte(r), token(lit))
 //         }
 //         for _, name := range Keywords {
 //             lexer.Add([]byte(strings.ToLower(name)), token(name))
 //         }
 //
 //         lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
 //         lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
 //         lexer.Add([]byte(`([a-z]|[A-Z])([a-z]|[A-Z]|[0-9]|_)*`), token("ID"))
 //         lexer.Add([]byte(`"([^\\"]|(\\.))*"`), token("ID"))
 //         lexer.Add([]byte("( |\t|\n|\r)+"), skip)
 //         lexer.Add([]byte(`\<`),
 //             func(scan *lexmachine.Scanner, match *machines.Match) (interface{}, error) {
 //                 str := make([]byte, 0, 10)
 //                 str = append(str, match.Bytes...)
 //                 brackets := 1
 //                 match.EndLine = match.StartLine
 //                 match.EndColumn = match.StartColumn
 //                 for tc := scan.TC; tc < len(scan.Text); tc++ {
 //                     str = append(str, scan.Text[tc])
 //                     match.EndColumn += 1
 //                     if scan.Text[tc] == '\n' {
 //                         match.EndLine += 1
 //                     }
 //                     if scan.Text[tc] == '<' {
 //                         brackets += 1
 //                     } else if scan.Text[tc] == '>' {
 //                         brackets -= 1
 //                     }
 //                     if brackets == 0 {
 //                         match.TC = scan.TC
 //                         scan.TC = tc + 1
 //                         match.Bytes = str
 //                         return token("ID")(scan, match)
 //                     }
 //                 }
 //                 return nil,
 //                     fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
 //                         match.TC, match.StartLine, match.StartColumn)
 //             },
 //         )
 //
 //         err := lexer.Compile()
 //         if err != nil {
 //             return nil, err
 //         }
 //         return lexer, nil
 //     }
 //
 //     func token(name string) lex.Action {
 //         return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
 //             return s.Token(TokenIds[name], string(m.Bytes), m), nil
 //         }
 //     }
 //
 // Example of using a lexer
 //
 //     func ExampleLex() error {
 //         lexer, err := CreateLexer()
 //         if err != nil {
 //             return err
 //         }
 //         scanner, err := lexer.Scanner([]byte(`digraph {
 //           rankdir=LR;
 //           a [label="a" shape=box];
 //           c [<label>=<<u>C</u>>];
 //           b [label="bb"];
 //           a -> c;
 //           c -> b;
 //           d -> c;
 //           b -> a;
 //           b -> e;
 //           e -> f;
 //         }`))
 //         if err != nil {
 //             return err
 //         }
 //         fmt.Println("Type    | Lexeme     | Position")
 //         fmt.Println("--------+------------+------------")
 //         for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() {
 //             if err != nil {
 //                 return err
 //             }
 //             token := tok.(*lexmachine.Token)
 //             fmt.Printf("%-7v | %-10v | %v:%v-%v:%v\n",
 //                 dot.Tokens[token.Type],
 //                 string(token.Lexeme),
 //                 token.StartLine,
 //                 token.StartColumn,
 //                 token.EndLine,
 //                 token.EndColumn)
 //         }
 //         return nil
 //     }
 //
 package lexmachine
--- a/lexer.go
+++ b/lexer.go
@ -53,7 +53,20 @@ func (t *Token) Equals(other *Token) bool {
 // String formats the token in a human readable form. The fields appear in
 // this order: Type, the quoted Value, TC, the
 // (StartLine, StartColumn)-(EndLine, EndColumn) pair, and the
 // (TSLine, TSColumn)-(TELine, TEColumn) pair.
 func (t *Token) String() string {
-	return fmt.Sprintf("%d %q %d (%d, %d)-(%d, %d)", t.Type, t.Value, t.TC, t.StartLine, t.StartColumn, t.EndLine, t.EndColumn)
+	return fmt.Sprintf(
 		"%d %q %d (%d, %d)-(%d, %d) (%d, %d)-(%d, %d)",
 		t.Type,
 		t.Value,
 		t.TC,
 		t.StartLine,
 		t.StartColumn,
 		t.EndLine,
 		t.EndColumn,
 		t.TSLine,
 		t.TSColumn,
 		t.TELine,
 		t.TEColumn,
 	)
 }
 // An Action is a function which get called when the Scanner finds a match