## Lexmachine

A lexical analyzer adapted from the open-source lexmachine project. The main change is the computation of line and column numbers for UTF-8 text.

### Quick Start

#### Installation

```
go get gitea.xintech.co/zhouzhihong/lexmachine
```

#### Usage Example

```
package main

import (
	"fmt"
	"strings"

	lex "gitea.xintech.co/zhouzhihong/lexmachine"
	"gitea.xintech.co/zhouzhihong/lexmachine/machines"
)

var Literals []string       // The tokens representing literal strings
var Keywords []string       // The keyword tokens
var Tokens []string         // All of the tokens (including literals and keywords)
var TokenIds map[string]int // A map from the token names to their int ids
var Lexer *lex.Lexer        // The lexer object. Use this to construct a Scanner

// Called at package initialization. Creates the lexer and populates token lists.
func init() {
	initTokens()
	var err error
	Lexer, err = initLexer()
	if err != nil {
		panic(err)
	}
}

func initTokens() {
	Literals = []string{
		"[",
		"]",
		"{",
		"}",
		"=",
		",",
		";",
		":",
		"->",
		"--",
	}
	Keywords = []string{
		"NODE",
		"EDGE",
		"GRAPH",
		"DIGRAPH",
		"SUBGRAPH",
		"STRICT",
	}
	Tokens = []string{
		"COMMENT",
		"ID",
	}
	Tokens = append(Tokens, Keywords...)
	Tokens = append(Tokens, Literals...)
	TokenIds = make(map[string]int)
	for i, tok := range Tokens {
		TokenIds[tok] = i
	}
}

// Creates the lexer object and compiles the NFA.
func initLexer() (*lex.Lexer, error) {
	lexer := lex.NewLexer()
	for _, lit := range Literals {
		r := "\\" + strings.Join(strings.Split(lit, ""), "\\")
		lexer.Add([]byte(r), token(lit))
	}
	for _, name := range Keywords {
		lexer.Add([]byte(strings.ToLower(name)), token(name))
	}
	lexer.Add([]byte(`//[^\n]*\n?`), token("COMMENT"))
	lexer.Add([]byte(`/\*([^*]|\r|\n|(\*+([^*/]|\r|\n)))*\*+/`), token("COMMENT"))
	lexer.Add([]byte(`([a-z]|[A-Z]|[0-9]|_)+`), token("ID"))
	lexer.Add([]byte(`[0-9]*\.[0-9]+`), token("ID"))
	lexer.Add([]byte(`"([^\\"]|(\\.))*"`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			x, _ := token("ID")(scan, match)
			t := x.(*lex.Token)
			v := t.Value.(string)
			t.Value = v[1 : len(v)-1]
			return t, nil
		})
	lexer.Add([]byte("( |\t|\n|\r)+"), skip)
	lexer.Add([]byte(`\<`),
		func(scan *lex.Scanner, match *machines.Match) (interface{}, error) {
			str := make([]byte, 0, 10)
			str = append(str, match.Bytes...)
			brackets := 1
			match.EndLine = match.StartLine
			match.EndColumn = match.StartColumn
			for tc := scan.TC; tc < len(scan.Text); tc++ {
				str = append(str, scan.Text[tc])
				match.EndColumn += 1
				if scan.Text[tc] == '\n' {
					match.EndLine += 1
				}
				if scan.Text[tc] == '<' {
					brackets += 1
				} else if scan.Text[tc] == '>' {
					brackets -= 1
				}
				if brackets == 0 {
					match.TC = scan.TC
					scan.TC = tc + 1
					match.Bytes = str
					x, _ := token("ID")(scan, match)
					t := x.(*lex.Token)
					v := t.Value.(string)
					t.Value = v[1 : len(v)-1]
					return t, nil
				}
			}
			return nil, fmt.Errorf("unclosed HTML literal starting at %d, (%d, %d)",
				match.TC, match.StartLine, match.StartColumn)
		},
	)
	err := lexer.Compile()
	if err != nil {
		return nil, err
	}
	return lexer, nil
}

// a lex.Action function which skips the match.
func skip(*lex.Scanner, *machines.Match) (interface{}, error) {
	return nil, nil
}

// a lex.Action function which constructs a Token of the given token type by
// the token type's name.
func token(name string) lex.Action {
	return func(s *lex.Scanner, m *machines.Match) (interface{}, error) {
		return s.Token(TokenIds[name], string(m.Bytes), m), nil
	}
}

// Scans a small piece of dot-like input and prints each token together with
// its start and end position as (line, column) pairs.
func main() {
	s, _ := Lexer.Scanner([]byte(`
digraph G {
	a -> b;
}
<>asdf>x
"asdf\\\\\""
// asdf
strict
// asdfa asfwe
/*你好*/
DIGRAPH // asdf`))
	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		fmt.Println(tok, err)
		if t, ok := tok.(*lex.Token); ok {
			fmt.Printf("(%v,%v)-(%v,%v)\n", t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
		}
	}
}
```

TODO: optimize the position-marker calculation.
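Until that optimization lands, a quick way to sanity-check the positions reported for multi-byte input is a small sketch along the lines below. It reuses `Lexer`, `Tokens`, and the `TSLine`/`TSColumn`/`TELine`/`TEColumn` fields from the example above; the helper name `printPositions` is only illustrative, and it assumes, as in upstream lexmachine, that `Token.Type` holds the integer id registered through `TokenIds`. The exact column values reported for the Chinese characters depend on how this fork counts multi-byte runes, so none are asserted here.

```
// Illustrative sketch: scan input containing multi-byte UTF-8 characters and
// print each token's name, value, and (line, column) span.
func printPositions(src string) error {
	s, err := Lexer.Scanner([]byte(src))
	if err != nil {
		return err
	}
	for tok, err, eof := s.Next(); !eof; tok, err, eof = s.Next() {
		if err != nil {
			return err
		}
		t := tok.(*lex.Token)
		fmt.Printf("%-8s %q (%v,%v)-(%v,%v)\n",
			Tokens[t.Type], t.Value, t.TSLine, t.TSColumn, t.TELine, t.TEColumn)
	}
	return nil
}
```

For example, calling `printPositions("graph /*你好，世界*/ G")` shows whether the COMMENT token's end column advances per rune rather than per byte.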