Compare commits

4 Commits: ef4073251a...62b949dd83

| SHA1 |
|---|
| 62b949dd83 |
| 95e26e7aae |
| 7d2133ac25 |
| d2362be35b |
README.md · 166 lines · new file

@@ -0,0 +1,166 @@
## Lexmachine

A lexical analyzer, adapted from an open-source project. The main change is the computation of line and column numbers for UTF-8 text.
### Quick start

#### Install

```
go get gitea.xintech.co/zhouzhihong/lexmachine
```

#### Usage example
```
import (
	"fmt"
	"strings"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

var Literals []string       // The tokens representing literal strings
var Keywords []string       // The keyword tokens
var Tokens []string         // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner

// Called at package initialization. Creates the lexer and populates token lists.
func init() {
	initTokens()
	var err error
	Lexer, err = initLexer()
	if err != nil {
		panic(err)
	}
}

func initTokens() {
	Literals = []string{
		"[",
		"]",
		"{",
		"}",
		"=",
		",",
		";",
		":",
		"->",
		"--",
	}
	Keywords = []string{
		"NODE",
		"EDGE",
		"GRAPH",
		"DIGRAPH",
		"SUBGRAPH",
		"STRICT",
	}
	Tokens = []string{
		"COMMENT",
		"ID",
	}
	Tokens = append(Tokens, Keywords...)
	Tokens = append(Tokens, Literals...)
	TokenIds = make(map[string]int)
	for i, tok := range Tokens {
		TokenIds[tok] = i
	}
}

// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
	lexer := lex.NewLexer()

	for _, lit := range Literals {
		r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
		lexer.Add([]byte(r), token(lit))
	}
	for _, name := range Keywords {
		lexer.Add([]byte(strings.ToLower(name)), token(name))
	}

	lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
	lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
	lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			// Strip the surrounding quotes from the matched string.
			x, _ := token("ID")(scan, match)
			t := x.(*lex.Token)
			v := t.Value.(string)
			t.Value = v[1 : len(v)-1]
			return t, nil
		})
	lexer.Add([]byte("( |\t|\n|\r)+"), skip)
	lexer.Add([]byte(`\<`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			// Escape hatch: consume bytes past the match to lex a
			// (possibly nested) <...> HTML literal.
			str := make([]byte, 0, 10)
			str = append(str, match.Bytes...)
			brackets := 1
			match.EndLine = match.StartLine
			match.EndColumn = match.StartColumn
			for tc := scan.TC; tc < len(scan.Text); tc++ {
				str = append(str, scan.Text[tc])
				match.EndColumn += 1
				if scan.Text[tc] == '\n' {
					match.EndLine += 1
				}
				if scan.Text[tc] == '<' {
					brackets += 1
				} else if scan.Text[tc] == '>' {
					brackets -= 1
				}
				if brackets == 0 {
					match.TC = scan.TC
					scan.TC = tc + 1
					match.Bytes = str
					x, _ := token("ID")(scan, match)
					t := x.(*lex.Token)
					v := t.Value.(string)
					t.Value = v[1 : len(v)-1]
					return t, nil
				}
			}
			return nil,
				fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
					match.TC, match.StartLine, match.StartColumn)
		},
	)

	err := lexer.Compile()
	if err != nil {
		return nil, err
	}
	return lexer, nil
}

// a lex.Action function which skips the match.
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
	return nil, nil
}

// a lex.Action function which constructs a Token of the given token type by
// the token type's name.
func token(name string) lex.Action {
	return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
		return s.Token(TokenIds[name], string(m.Bytes), m), nil
	}
}

s, _ := Lexer.Scanner([]byte(`
digraph G { a -> b; } <asfd<asdf><a><>asdf>x "asdf\\\\\"" // asdf
strict // asdfa asfwe
/*你好*/ DIGRAPH // asdf`))

for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
	if err != nil {
		fmt.Println(err)
		break
	}
	t := tok.(*lex.Token)
	fmt.Println(t)
	fmt.Printf("(%v,%v)-(%v,%v)\n", t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
}
```

// TODO: optimize the position-marker computation
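
The core change in this fork is tracking token positions by rune rather than by byte, so a multi-byte UTF-8 character (such as 你 or 好 in the comment above) advances the column by one instead of three. Below is a minimal sketch of the idea only; it is not the library's actual implementation, and `advance` is a hypothetical helper:

```
package main

import (
	"fmt"
	"unicode/utf8"
)

// advance walks text rune by rune, updating 1-based line and column
// counters so that a multi-byte character advances the column by one.
func advance(line, col int, text []byte) (int, int) {
	for len(text) > 0 {
		r, size := utf8.DecodeRune(text)
		text = text[size:]
		if r == '\n' {
			line, col = line+1, 1
		} else {
			col++
		}
	}
	return line, col
}

func main() {
	line, col := advance(1, 1, []byte("/*你好*/"))
	fmt.Println(line, col) // 1 7: six runes consumed, even though len is ten bytes
}
```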
doc.go · 115 lines

@@ -1,116 +1 @@
// Package lexmachine is a full lexical analysis framework for the Go
// programming language. It supports a restricted but usable set of regular
// expressions appropriate for writing lexers for complex programming
// languages. The framework also supports sub-lexers and non-regular lexing
// through an "escape hatch" which allows the users to consume any number of
// further bytes after a match. So if you want to support nested C-style
// comments or other paired structures you can do so at the lexical analysis
// stage.
//
// For a tutorial see
// http://hackthology.com/writing-a-lexer-in-go-with-lexmachine.html
//
// Example of defining a lexer
//
//	// CreateLexer defines a lexer for the graphviz dot language.
//	func CreateLexer() (*lexmachine.Lexer, error) {
//		lexer := lexmachine.NewLexer()
//
//		for _, lit := range Literals {
//			r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
//			lexer.Add([]byte(r), token(lit))
//		}
//		for _, name := range Keywords {
//			lexer.Add([]byte(strings.ToLower(name)), token(name))
//		}
//
//		lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
//		lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
//		lexer.Add([]byte(`([a-z]|[A-Z])([a-z]|[A-Z]|[0-9]|_)*`), token("ID"))
//		lexer.Add([]byte(`"([^\\"]|(\\.))*"`), token("ID"))
//		lexer.Add([]byte("( |\t|\n|\r)+"), skip)
//		lexer.Add([]byte(`\<`),
//			func(scan *lexmachine.Scanner, match *machines.Match) (interface{}, error) {
//				str := make([]byte, 0, 10)
//				str = append(str, match.Bytes...)
//				brackets := 1
//				match.EndLine = match.StartLine
//				match.EndColumn = match.StartColumn
//				for tc := scan.TC; tc < len(scan.Text); tc++ {
//					str = append(str, scan.Text[tc])
//					match.EndColumn += 1
//					if scan.Text[tc] == '\n' {
//						match.EndLine += 1
//					}
//					if scan.Text[tc] == '<' {
//						brackets += 1
//					} else if scan.Text[tc] == '>' {
//						brackets -= 1
//					}
//					if brackets == 0 {
//						match.TC = scan.TC
//						scan.TC = tc + 1
//						match.Bytes = str
//						return token("ID")(scan, match)
//					}
//				}
//				return nil,
//					fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
//						match.TC, match.StartLine, match.StartColumn)
//			},
//		)
//
//		err := lexer.Compile()
//		if err != nil {
//			return nil, err
//		}
//		return lexer, nil
//	}
//
//	func token(name string) lex.Action {
//		return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
//			return s.Token(TokenIds[name], string(m.Bytes), m), nil
//		}
//	}
//
// Example of using a lexer
//
//	func ExampleLex() error {
//		lexer, err := CreateLexer()
//		if err != nil {
//			return err
//		}
//		scanner, err := lexer.Scanner([]byte(`digraph {
//			rankdir=LR;
//			a [label="a" shape=box];
//			c [<label>=<<u>C</u>>];
//			b [label="bb"];
//			a -> c;
//			c -> b;
//			d -> c;
//			b -> a;
//			b -> e;
//			e -> f;
//		}`))
//		if err != nil {
//			return err
//		}
//		fmt.Println("Type    | Lexeme     | Position")
//		fmt.Println("--------+------------+------------")
//		for tok, err, eos := scanner.Next(); !eos; tok, err, eos = scanner.Next() {
//			if err != nil {
//				return err
//			}
//			token := tok.(*lexmachine.Token)
//			fmt.Printf("%-7v | %-10v | %v:%v-%v:%v\n",
//				dot.Tokens[token.Type],
//				string(token.Lexeme),
//				token.StartLine,
//				token.StartColumn,
//				token.EndLine,
//				token.EndColumn)
//		}
//		return nil
//	}
package lexmachine
lexer.go · 15 lines

@@ -53,7 +53,20 @@ func (t *Token) Equals(other *Token) bool {
 
 // String formats the token in a human readable form.
 func (t *Token) String() string {
-	return fmt.Sprintf("%d %q %d (%d, %d)-(%d, %d)", t.Type, t.Value, t.TC, t.StartLine, t.StartColumn, t.EndLine, t.EndColumn)
+	return fmt.Sprintf(
+		"%d %q %d (%d, %d)-(%d, %d) (%d, %d)-(%d, %d)",
+		t.Type,
+		t.Value,
+		t.TC,
+		t.StartLine,
+		t.StartColumn,
+		t.EndLine,
+		t.EndColumn,
+		t.TSLine,
+		t.TSColumn,
+		t.TELine,
+		t.TEColumn,
+	)
 }
 
 // An Action is a function which get called when the Scanner finds a match
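
The four added fields round out String with a second coordinate pair. Assuming, as the README suggests, that TSLine/TSColumn and TELine/TEColumn are the fork's UTF-8-aware positions counted in runes, the two pairs diverge whenever a token contains multi-byte characters. A small, self-contained illustration (not part of the library):

```
package main

import (
	"fmt"
	"unicode/utf8"
)

func main() {
	lexeme := "/*你好*/" // 2 ASCII + 2 CJK + 2 ASCII characters
	startCol := 1

	// End column when columns advance once per byte
	// (the behavior the original End* fields appear to reflect).
	byteEnd := startCol + len(lexeme) - 1 // 10

	// End column when columns advance once per rune
	// (what a UTF-8-aware TE* field should hold).
	runeEnd := startCol + utf8.RuneCountInString(lexeme) - 1 // 6

	fmt.Println(byteEnd, runeEnd) // 10 6
}
```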