詞法分析器

bxtkezhan發表於2021-05-08

維基百科介紹:詞法分析是電腦科學中將字元序列轉換為標記序列的過程。進行詞法分析的程式或者函式叫作詞法分析器。

有如下原始程式程式碼

add_result = 1 + 2

通過詞法分析得到以下結果

NAME   `add_result` 0,  0
SYMBOL `=`          0, 11
INT    `1`          0, 13
SYMBOL `+`          0, 15
INT    `2`          0, 17

我們可以利用Go語言輕鬆實現可用的詞法分析器。


Go語言實現詞法分析器

package main

import (
    "fmt"
    "regexp"
    "unicode/utf8"
    "os"
)

// Token rules. exprs[i] is the regular expression for the token type names[i].
// Rules are tried in order and the first match wins, so INT must come before
// NAME: the NAME character class also accepts digits.
// Raw string literals keep the patterns free of doubled backslashes.
var (
    exprs = []string{`\d+`, `[\p{L}\d_]+`, `[\+\-=]`}
    names = []string{"INT", "NAME", "SYMBOL"}
)

// main compiles the token rules, then treats every command-line argument as
// one line of source code and prints the tokens found in it.
//
// Each token is printed as: type name, matched text, row (index of the
// argument) and column (counted in runes). At the first position where no rule
// matches, an error with that row and column is printed and the rest of the
// line is skipped.
func main() {
    rules := []*regexp.Regexp{}
    for i, expr := range exprs {
        // "^" anchors the rule to the current scan position. MustCompile
        // panics with the offending pattern instead of leaving a nil *Regexp
        // behind, which is what discarding Compile's error allowed.
        rule := regexp.MustCompile("^" + expr)
        rules = append(rules, rule)
        fmt.Println(names[i], rule)
    }

    fmt.Println("--------------------------------")
    for row, code := range os.Args[1:] {
        position := 0 // byte offset into code
        col := 0      // column shown to the user, counted in runes
        for {
            // Skip blanks. Space and tab are one byte and one column wide, so
            // the byte offset and the column advance together.
            for position < len(code) && (code[position] == ' ' || code[position] == '\t') {
                position += 1
                col += 1
            }
            if position >= len(code) {
                break
            }
            source := ""
            tokenType := -1 // index into rules/names; -1 means no rule matched
            for i, rule := range rules {
                // Rules are tried in order; the first non-empty match wins.
                source = rule.FindString(code[position:])
                if source != "" {
                    tokenType = i
                    break
                }
            }
            if tokenType >= 0 {
                fmt.Printf("%s\t`%s`\t%d\t%d\n", names[tokenType], source, row, col)
                // Advance the offset in bytes but the column in runes, so that
                // multi-byte UTF-8 text keeps the column correct.
                position += len(source)
                col += utf8.RuneCountInString(source)
            } else {
                fmt.Printf("error in: %d, %d\n", row, col)
                break
            }
        }
    }
}

在命令列中執行測試

➜ go run lexer.go "數值 = PI + 100"
INT        ^\d+
NAME    ^[\p{L}\d_]+
SYMBOL    ^[\+\-=]
--------------------------------
NAME    `數值`    0    0
SYMBOL    `=`        0    3
NAME    `PI`    0    5
SYMBOL    `+`        0    8
INT        `100`    0    10

Go語言程式碼說明

引入需要用到的包:

package main

import (
    "fmt"
    "regexp"
    "unicode/utf8"
    "os"
)
  • fmt 用於列印輸出
  • regexp 正規表示式
  • unicode/utf8 統計utf8的符文數量
  • os 獲取使用者輸入

指定正規表示式和欄位型別名稱:

// Token rules. exprs[i] is the regular expression for the token type names[i].
// Rules are tried in order and the first match wins, so INT must come before
// NAME: the NAME character class also accepts digits.
// Raw string literals keep the patterns free of doubled backslashes.
var (
    exprs = []string{`\d+`, `[\p{L}\d_]+`, `[\+\-=]`}
    names = []string{"INT", "NAME", "SYMBOL"}
)

建立兩個字串陣列分別用於儲存正規表示式與對應的欄位型別名稱。

初始化欄位匹配規則:

func main() {
    rules := []*regexp.Regexp{}
    for i, expr := range exprs {
        // "^" anchors the rule to the current scan position. MustCompile
        // panics with the offending pattern instead of leaving a nil *Regexp
        // behind, which is what discarding Compile's error allowed.
        rule := regexp.MustCompile("^" + expr)
        rules = append(rules, rule)
        fmt.Println(names[i], rule)
    }

需要注意的是必須為每一個正規表示式頭前插入^用來確保匹配的字串包括最左邊的一個字元,避免“跳躍匹配”。

迴圈匹配欄位:

// Every command-line argument is scanned as one line of source; row is the
// index of the argument.
for row, code := range os.Args[1:] {
    position := 0 // byte offset into code
    col := 0      // column shown to the user, counted in runes
    for {
        // Skip blanks. Space and tab are one byte and one column wide, so the
        // byte offset and the column advance together.
        for position < len(code) && (code[position] == ' ' || code[position] == '\t') {
            position += 1
            col += 1
        }
        if position >= len(code) {
            break
        }
        source := ""
        tokenType := -1 // index into rules/names; -1 means no rule matched
        for i, rule := range rules {
            // Rules are tried in order; the first non-empty match wins.
            source = rule.FindString(code[position:])
            if source != "" {
                tokenType = i
                break
            }
        }
        if tokenType >= 0 {
            fmt.Printf("%s\t`%s`\t%d\t%d\n", names[tokenType], source, row, col)
            // Advance the offset in bytes but the column in runes.
            position += len(source)
            col += utf8.RuneCountInString(source)
        } else {
            fmt.Printf("error in: %d, %d\n", row, col)
            break
        }
    }
}

使用遍歷os.Args[1:]的方法將使用者輸入的每一個引數作為一行程式碼進行詞法分析。

跳過【忽略】空字元:

// Skip blanks. Space and tab are one byte and one column wide, so the byte
// offset (position) and the column counter (col) advance together.
for position < len(code) && (code[position] == ' ' || code[position] == '\t') {
    position += 1
    col += 1
}

因為我們的正規表示式必須匹配最左邊的一個字元所以需要跳過一些常常沒有意義的空字元。

判斷是否需要中斷迴圈:

// The offset has reached the end of the line: stop scanning it.
if position >= len(code) {
    break
}

遍歷匹配規則嘗試匹配:

source := ""    // text matched at the current position
tokenType := -1 // index into rules; stays -1 when no rule matches
for i, rule := range rules {
    // Rules are tried in order; the first non-empty match wins.
    source = rule.FindString(code[position:])
    if source != "" {
        tokenType = i
        break
    }
}

迴圈遍歷設定的規則進行匹配:如果某條規則匹配成功,就將其下標設為tokenType的值並停止遍歷;如果始終沒有匹配,tokenType保持預設值-1。

根據匹配結果判斷後續行為:

if tokenType >= 0 {
    // Print type name, matched text, row and column of the token.
    fmt.Printf("%s\t`%s`\t%d\t%d\n", names[tokenType], source, row, col)
    // Advance the offset in bytes but the column in runes.
    position += len(source)
    col += utf8.RuneCountInString(source)
} else {
    // No rule matched here: report the location and leave the scanning loop.
    fmt.Printf("error in: %d, %d\n", row, col)
    break
}

如果tokenType不為-1,則匹配成功:列印欄位型別名稱、字面量與行列資訊,並將position加上該欄位的位元組長度以跳過當前欄位。需要注意的是,position是位元組偏移,而列號col必須以utf8的符文(rune)數量遞增;否則遇到中文等多位元組的UTF-8字元時,列號會被算成位元組數而指向錯誤的位置。

Python使用者也可以輕鬆地實現。


Python詞法分析器

import re
import sys


# Token rules: exprs[i] is the regular expression for the token type names[i].
# Rules are tried in order and the first match wins, so INT must come before
# NAME because \w+ also matches digits.
# Raw strings keep the patterns free of doubled backslashes.
exprs = [r'\d+', r'\w+', r'[\+\-=]']
names = ['INT',  'NAME', 'SYMBOL']


def main():
    """Tokenize every command-line argument as one line of source code.

    The compiled rules are printed first, followed by a separator line. Each
    token is then printed as: type name, matched text, row (index of the
    argument) and column (index of the token's first character). At the first
    position where no rule matches, an error with that row and column is
    printed and the rest of the line is skipped.
    """
    rules = []
    for i, expr in enumerate(exprs):
        # '^' anchors the rule to the start of the text it is applied to.
        rules.append(re.compile('^' + expr))
        print(names[i], rules[-1].pattern)

    print('-' * 32)
    for row, code in enumerate(sys.argv[1:]):
        position = 0
        while True:
            # Skip blanks between tokens.
            while position < len(code) and code[position] in ' \t':
                position += 1
            if position >= len(code):
                break

            source = ''
            tokenType = -1  # index into rules/names; -1 means no rule matched
            for i, rule in enumerate(rules):
                # match() yields a single match at the start of the slice
                # instead of building a list of results like findall().
                matched = rule.match(code[position:])
                # An empty match would never advance `position`, so it is
                # treated as "no match" rather than looping forever.
                if matched and matched.group(0):
                    source = matched.group(0)
                    tokenType = i
                    break
            if tokenType >= 0:
                print(f'{names[tokenType]}\t`{source}`\t{row}\t{position}')
                position += len(source)
            else:
                print(f'error in {row}, {position}')
                break


# Run the lexer only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

作為補充內容,這裡也提供C++方案。


C++實現詞法分析器

#include <codecvt>
#include <cstdio>
#include <locale>
#include <regex>
#include <stdexcept>
#include <string>
#include <vector>


// Token rules: exprs[i] is the regular expression for the token type names[i].
// Rules are tried in order and the first match wins, so INT must come before
// NAME because \w+ also matches digits. Both tables are read-only.
const std::vector<std::wstring> exprs{L"\\d+", L"\\w+", L"[\\+\\-=]"};
const std::vector<std::string> names{"INT",  "NAME", "SYMBOL"};


/**
 * Lexer entry point: treats every command-line argument as one line of source
 * code and prints the tokens found in it.
 *
 * The rules are printed first, followed by a separator line. Each token is
 * printed as: type name, matched text (as UTF-8), row (index of the argument)
 * and column (offset in wide characters). At the first position where no rule
 * matches, an error with that row and column is printed and the rest of the
 * line is skipped.
 *
 * @param argc number of command-line arguments, including the program name
 * @param argv command-line arguments; argv[1..] are decoded as UTF-8
 * @return always 0
 */
int main(int argc, char *argv[]) {
    // Remember the current global locale so it can be restored before exit.
    const std::locale old;
    try {
        // A default-constructed std::wregex copies the global locale, so it is
        // switched before the rules are compiled.
        std::locale::global(std::locale("en_US.UTF-8"));
    } catch (const std::runtime_error &) {
        // The named locale is not installed: keep the current one instead of
        // terminating on the uncaught exception.
        fprintf(stderr, "warning: locale en_US.UTF-8 is unavailable, keeping the current locale\n");
    }
    // NOTE(review): std::wstring_convert and std::codecvt_utf8 are deprecated
    // since C++17; they are kept because the standard library has no direct
    // replacement.
    std::wstring_convert<std::codecvt_utf8<wchar_t>> codecvt_utf8;

    std::vector<std::wregex> rules;
    rules.reserve(exprs.size());
    for (size_t i = 0, count = exprs.size(); i < count; ++i) {
        // "^" anchors the rule to the position the search starts from.
        rules.push_back(std::wregex(L"^" + exprs[i]));
        printf("%s ^%s\n", names[i].c_str(), codecvt_utf8.to_bytes(exprs[i]).c_str());
    }

    printf("--------------------------------\n");
    for (int row = 0; row < argc - 1; ++row) {
        std::wstring code;
        try {
            code = codecvt_utf8.from_bytes(argv[row + 1]);
        } catch (const std::range_error &) {
            // from_bytes throws on malformed input; report it and go on with
            // the next argument instead of aborting the whole run.
            fprintf(stderr, "error: argument %d is not valid UTF-8\n", row);
            continue;
        }

        size_t position = 0;
        while (true) {
            // Skip blanks between tokens.
            while (position < code.size() && (code[position] == L' ' || code[position] == L'\t'))
                position += 1;
            if (position >= code.size())
                break;

            // Search the tail of `code` in place rather than copying it with
            // substr() for every token.
            const auto start = code.cbegin() + static_cast<std::wstring::difference_type>(position);
            std::wsmatch match;
            int tokenType = -1;  // index into rules/names; -1 means no rule matched
            for (size_t i = 0, count = rules.size(); i < count; ++i) {
                // match_continuous only accepts a match that begins at `start`.
                // An empty match is rejected because it would never advance
                // `position`.
                if (std::regex_search(start, code.cend(), match, rules[i],
                                      std::regex_constants::match_continuous)
                    && match.length(0) > 0) {
                    tokenType = static_cast<int>(i);
                    break;
                }
            }

            if (tokenType >= 0) {
                const auto source = match.str(0);
                // %zu is the conversion for size_t; %ld is wrong wherever long
                // and size_t differ in size.
                printf("%s\t`%s`\t%d\t%zu\n",
                    names[tokenType].c_str(), codecvt_utf8.to_bytes(source).c_str(), row, position);
                position += source.size();
            } else {
                printf("error in: %d, %zu\n", row, position);
                break;
            }
        }
    }

    std::locale::global(old);
    return 0;
}

相關文章