之前專案有一個需求,業務人員使用中文編寫一些自定義公式,然後需要我們後臺執行將結果返回到介面上,於是就基於有限狀態機寫了這個詞法分析器,比較簡單,希望能夠拋磚引玉。
一、分析需求
輸入中文公式,返回結果,比如:
現有薪資=10000;
個稅起點=3000;
當前年份=2021;
如果(當前年份=2022){
個稅起點=5000;
}
返回 (現有薪資-個稅起點) * 0.2;
二、實現需求
最初的想法是使用字串替換的方式,將中文關鍵字替換成php的關鍵字,然後呼叫eval執行,這樣確實也是可以的,但是總覺得不是很美麗,並且不能實現動態解析。就想著自己實現一個簡單的詞法分析,然後結合ast將詞法轉換成php程式碼執行,豈不快哉。當前版本沒有用到抽象語法樹來生成程式碼,全部使用字串拼接。
<?php
/**
 * Class Lexer
 * @package Sett\OaLang
 *
 * Lexer for a small formula language whose identifiers and keywords may be
 * written in Chinese. parseInput() turns the source text into a flat list of
 * tokens and convert() concatenates those tokens into PHP source text.
 *
 * Every token is an array with three keys:
 *  - "type": one of the public type constants below (or "eof" from nextToken()),
 *  - "char": the token text,
 *  - "pos":  a character offset recorded when the token was built.
 */
class Lexer {
    // Keyword map: keyword as written in the source => text emitted by convert().
    public $keywordList = [];
    // Single-character operators and punctuation recognised by the lexer.
    public $operatorList = [
        "+", "-", "*", "/", "=", ">", "<", "!", "(", ")", "{", "}", ",", ";"
    ];
    // Source text being tokenised.
    private $input;
    // Character under the cursor, or $eof once the input is exhausted.
    private $currChar;
    // Zero-based character (not byte) offset of the cursor.
    private $currCharPos = 0;
    // End-of-input sentinel. It is three characters long, so it can never be
    // equal to a real character read from the input (always one character).
    private $eof = "eof";
    // Encoding passed to every multibyte string call.
    private $currEncode = "UTF-8";
    // Token types.
    public const VAR = "variable";
    public const STR = "string";
    public const KW = "keyword";
    public const OPR = "operator";
    // Numeric literal. Despite the name this also carries decimals such as "0.2".
    public const INT = "integer";
    public const NIL = "null";

    /**
     * Lexer constructor.
     * @param string $input source text to tokenise
     */
    public function __construct(string $input) {
        $this->input = $input;
        $this->currChar = $this->readChar($this->currCharPos);
    }

    /**
     * Replace the keyword map.
     * @param array $keywordList map of source keyword => emitted text
     */
    public function setKeywordList($keywordList) {
        $this->keywordList = $keywordList;
    }

    /**
     * Tokenise the input from the current cursor position to the end.
     *
     * A NIL token holding a single space is inserted after every keyword so
     * that convert() keeps the keyword separated from whatever follows it.
     *
     * @return array list of tokens, without the trailing "eof" token
     * @throws Exception when the input is empty or a string literal is not closed
     */
    public function parseInput() {
        if ($this->input === "") {
            throw new Exception("code can not be empty");
        }
        $tokens = [];
        while (true) {
            $token = $this->nextToken();
            if ($token["type"] === "eof") {
                break;
            }
            $tokens[] = $token;
            if ($token["type"] === static::KW) {
                $tokens[] = $this->makeToken(static::NIL, " ");
            }
        }
        return $tokens;
    }

    /**
     * Read the next token and advance the cursor past it.
     *
     * @return array the token; its type is "eof" once the input is exhausted
     * @throws Exception when a string literal is not closed
     */
    public function nextToken() {
        $this->skipBlankChar();

        if ($this->currChar === $this->eof) {
            return $this->currToken('eof', $this->currChar);
        }
        // A word starting with a Chinese character consists of Chinese characters only.
        if ($this->isCnLetter()) {
            return $this->wordToken($this->matchUntilNextCharIsNotCn());
        }
        if ($this->isOperator()) {
            return $this->currToken(static::OPR, $this->currChar);
        }
        if ($this->isDigit($this->currChar)) {
            $number = $this->matchNumber();
            // The cursor already sits on the character after the literal, so
            // record the offset of its last character and do not advance.
            return $this->tokenAt(static::INT, $number, $this->currCharPos - 1);
        }
        // Test for the opening quote itself rather than for a non-empty match,
        // so that "" and "0" are still recognised as string literals.
        if ($this->currChar === "\"") {
            $text = $this->matchCompleteStr();
            // The cursor is on the closing quote; currToken() steps over it.
            return $this->currToken(static::STR, $text);
        }
        // A word starting with an English letter may mix English and Chinese letters.
        if ($this->isVar()) {
            return $this->wordToken($this->matchVar());
        }
        // Anything unrecognised is passed through as a one-character variable.
        return $this->currToken(static::VAR, $this->currChar);
    }

    /**
     * Build the token for a word that has just been consumed: a keyword when
     * the word is in the keyword map, otherwise a variable.
     *
     * The cursor is left on the first character after the word in both cases,
     * so the character following a keyword is never skipped.
     *
     * @param string $word
     * @return array
     */
    private function wordToken(string $word): array {
        if ($this->isKeyword($word)) {
            return $this->tokenAt(static::KW, $word, $this->currCharPos - 1);
        }
        // Everything that is not a keyword is treated as a variable.
        return $this->makeToken(static::VAR, $word);
    }

    /**
     * Consume consecutive English or Chinese letters.
     * @return string the consumed word
     */
    private function matchVar(): string {
        $word = "";
        while ($this->isVar()) {
            $word .= $this->currChar;
            $this->nextChar();
        }
        return $word;
    }

    /**
     * Consume a numeric literal: one or more digits, optionally followed by a
     * single "." and more digits. A "." that is not followed by a digit is not
     * part of the literal.
     * @return string the literal text
     */
    private function matchNumber(): string {
        $number = "";
        $seenDot = false;
        while (true) {
            if ($this->isDigit($this->currChar)) {
                $number .= $this->currChar;
            } elseif (!$seenDot
                && $this->currChar === "."
                && $this->isDigit($this->readChar($this->currCharPos + 1))) {
                $seenDot = true;
                $number .= ".";
            } else {
                break;
            }
            $this->nextChar();
        }
        return $number;
    }

    /**
     * @return bool whether the current character can be part of a word
     */
    private function isVar(): bool {
        return $this->isCnLetter() || $this->isEnLetter();
    }

    /**
     * Skip spaces, tabs, line feeds and carriage returns.
     */
    private function skipBlankChar(): void {
        while (in_array($this->currChar, [" ", "\t", "\n", "\r"], true)) {
            $this->nextChar();
        }
    }

    /**
     * Build a token at the cursor position, then advance the cursor by one character.
     * @param string $type
     * @param string $word
     * @return array
     */
    private function currToken(string $type, $word): array {
        $token = $this->makeToken($type, $word);
        $this->nextChar();
        return $token;
    }

    /**
     * Build a token whose "pos" is the current cursor position.
     * @param string $type
     * @param string $char
     * @return array
     */
    private function makeToken(string $type, string $char): array {
        return $this->tokenAt($type, $char, $this->currCharPos);
    }

    /**
     * Build a token with an explicit position.
     * @param string $type
     * @param string $char
     * @param int $pos
     * @return array
     */
    private function tokenAt(string $type, string $char, int $pos): array {
        return ["type" => $type, "char" => $char, "pos" => $pos];
    }

    /**
     * @return bool whether the current character is an ASCII letter (a-z or A-Z,
     *              both ends included)
     */
    private function isEnLetter(): bool {
        return preg_match('/^[A-Za-z]$/', $this->currChar) === 1;
    }

    /**
     * @return bool whether the current character lies in U+4E00..U+9FA5
     */
    private function isCnLetter(): bool {
        return preg_match('/^[\x{4e00}-\x{9fa5}]+$/u', $this->currChar) === 1;
    }

    /**
     * @param string $char
     * @return bool whether $char is a single ASCII digit
     */
    private function isDigit(string $char): bool {
        return preg_match('/^[0-9]$/', $char) === 1;
    }

    /**
     * Consume a double-quoted string literal. The cursor must be on the opening
     * quote; on return it is on the closing quote. No escape sequences are
     * recognised, so the literal ends at the first following quote.
     *
     * @return string the text between the quotes (may be empty)
     * @throws Exception when the input ends before the closing quote
     */
    private function matchCompleteStr(): string {
        $start = $this->currCharPos;
        $text = "";
        $this->nextChar();
        while ($this->currChar !== "\"") {
            // Without this check the loop would never end on unterminated input,
            // because the end-of-input sentinel is never equal to a quote.
            if ($this->currChar === $this->eof) {
                throw new Exception("unterminated string starting at position " . $start);
            }
            $text .= $this->currChar;
            $this->nextChar();
        }
        return $text;
    }

    /**
     * @return bool whether the current character is listed in $operatorList
     */
    private function isOperator(): bool {
        return in_array($this->currChar, $this->operatorList, true);
    }

    /**
     * Consume consecutive Chinese characters.
     * @return string the consumed word
     */
    private function matchUntilNextCharIsNotCn(): string {
        $word = "";
        while ($this->isCnLetter()) {
            $word .= $this->currChar;
            $this->nextChar();
        }
        return $word;
    }

    /**
     * Move the cursor one character forward and load that character.
     */
    private function nextChar(): void {
        $this->currCharPos += 1;
        $this->currChar = $this->readChar($this->currCharPos);
    }

    /**
     * Read the character at a given offset without moving the cursor.
     * @param int $pos character offset
     * @return string the character, or the end-of-input sentinel past the end
     */
    private function readChar(int $pos): string {
        $char = mb_substr($this->input, $pos, 1, $this->currEncode);
        return $char === "" ? $this->eof : $char;
    }

    /**
     * @param string $input
     * @return bool whether $input has a non-empty entry in the keyword map
     */
    private function isKeyword(string $input): bool {
        return ($this->keywordList[$input] ?? "") != "";
    }

    /**
     * Concatenate tokens into PHP source text.
     *
     * Keywords are replaced through the keyword map, variables get a "$"
     * prefix, operators go through replace(), strings are re-quoted and every
     * other token is emitted verbatim. The result is only returned; this class
     * does not execute it.
     *
     * @param array $tokens tokens as produced by parseInput()
     * @return string
     */
    public function convert(array $tokens) {
        $code = "";
        foreach ($tokens as $token) {
            $text = $token["char"];
            switch ($token["type"]) {
                case static::KW:
                    $code .= $this->keywordList[$text];
                    break;
                case static::VAR:
                    $code .= "$" . $text;
                    break;
                case static::OPR:
                    $code .= $this->replace($text);
                    break;
                case static::STR:
                    // The literal is emitted inside double quotes, where PHP
                    // would interpolate "$..." and interpret backslash escapes.
                    // Escape both so the text stays literal in the output.
                    $code .= "\"" . strtr($text, ["\\" => "\\\\", "$" => "\\$"]) . "\"";
                    break;
                case static::INT:
                default:
                    $code .= $text;
            }
        }
        return $code;
    }

    /**
     * Map an operator of the formula language to its PHP spelling.
     *
     * NOTE(review): every "+" becomes the PHP concatenation operator ".", so a
     * numeric addition such as 1+2 is emitted as 1.2 - confirm this is intended.
     *
     * @param string $char
     * @return string
     */
    private function replace(string $char): string {
        return str_replace("+", ".", $char);
    }
}
三、如何使用
require __DIR__ . "/vendor/autoload.php";
// Sample program written in the formula language. A nowdoc is used so PHP
// never interpolates anything inside the sample text.
$code = <<<'EOF'
姓名="腕豪";
問候="你好啊";
地址=(1+2) * 3;
如果(地址 > 3){
地址=1;
}否則{
地址="艾歐尼亞";
}
說話 = ("我"+"愛")+"你";
返回 姓名+年齡;
EOF;
$lexer = new Lexer($code);
// Map each custom keyword to the PHP keyword it should be converted to.
$kwMap = [
"如果" => "if", "否則" => "else", "返回" => "return", "否則如果" => "elseif"
];
$lexer->setKeywordList($kwMap);
// Tokenise the sample program.
$tokens = $lexer->parseInput();
// Turn the tokens into PHP source by plain concatenation. Building an AST with
// a PHP parser library and printing it would be an alternative.
var_dump($lexer->convert($tokens));
生成詞
[{
"type": "variable",
"char": "姓名",
"pos": 2
}, {
"type": "operator",
"char": "=",
"pos": 2
}, {
"type": "string",
"char": "腕豪",
"pos": 7
}, {
"type": "operator",
"char": ";",
"pos": 8
}, {
"type": "variable",
"char": "問候",
"pos": 13
}, {
"type": "operator",
"char": "=",
"pos": 13
}, {
"type": "string",
"char": "你好啊",
"pos": 17
}, {
"type": "operator",
"char": ";",
"pos": 18
}, {
"type": "variable",
"char": "地址",
"pos": 23
}, {
"type": "operator",
"char": "=",
"pos": 23
}, {
"type": "operator",
"char": "(",
"pos": 24
}, {
"type": "integer",
"char": "1",
"pos": 25
}, {
"type": "operator",
"char": "+",
"pos": 26
}, {
"type": "integer",
"char": "2",
"pos": 27
}, {
"type": "operator",
"char": ")",
"pos": 28
}, {
"type": "operator",
"char": "*",
"pos": 30
}, {
"type": "integer",
"char": "3",
"pos": 32
}, {
"type": "operator",
"char": ";",
"pos": 33
}, {
"type": "keyword",
"char": "如果",
"pos": 37
}, {
"type": "null",
"char": " ",
"pos": 38
}, {
"type": "operator",
"char": "(",
"pos": 38
}, {
"type": "variable",
"char": "地址",
"pos": 41
}, {
"type": "operator",
"char": ">",
"pos": 42
}, {
"type": "integer",
"char": "3",
"pos": 44
}, {
"type": "operator",
"char": ")",
"pos": 45
}, {
"type": "operator",
"char": "{",
"pos": 46
}, {
"type": "variable",
"char": "地址",
"pos": 55
}, {
"type": "operator",
"char": "=",
"pos": 55
}, {
"type": "integer",
"char": "1",
"pos": 56
}, {
"type": "operator",
"char": ";",
"pos": 57
}, {
"type": "operator",
"char": "}",
"pos": 60
}, {
"type": "keyword",
"char": "否則",
"pos": 62
}, {
"type": "null",
"char": " ",
"pos": 63
}, {
"type": "operator",
"char": "{",
"pos": 63
}, {
"type": "variable",
"char": "地址",
"pos": 72
}, {
"type": "operator",
"char": "=",
"pos": 72
}, {
"type": "string",
"char": "艾歐尼亞",
"pos": 78
}, {
"type": "operator",
"char": ";",
"pos": 79
}, {
"type": "operator",
"char": "}",
"pos": 82
}, {
"type": "variable",
"char": "說話",
"pos": 87
}, {
"type": "operator",
"char": "=",
"pos": 88
}, {
"type": "operator",
"char": "(",
"pos": 90
}, {
"type": "string",
"char": "我",
"pos": 93
}, {
"type": "operator",
"char": "+",
"pos": 94
}, {
"type": "string",
"char": "愛",
"pos": 97
}, {
"type": "operator",
"char": ")",
"pos": 98
}, {
"type": "operator",
"char": "+",
"pos": 99
}, {
"type": "string",
"char": "你",
"pos": 102
}, {
"type": "operator",
"char": ";",
"pos": 103
}, {
"type": "keyword",
"char": "返回",
"pos": 107
}, {
"type": "null",
"char": " ",
"pos": 108
}, {
"type": "variable",
"char": "姓名",
"pos": 111
}, {
"type": "operator",
"char": "+",
"pos": 111
}, {
"type": "variable",
"char": "年齡",
"pos": 114
}, {
"type": "operator",
"char": ";",
"pos": 114
}]
輸出:
$姓名="腕豪";$問候="你好啊";$地址=(1.2)*3;if ($地址>3){$地址=1;}else {$地址="艾歐尼亞";}$說話=("我"."愛")."你";return $姓名.$年齡;
能執行嗎?當然能。還存在一些小bug,不想改了。
四、使用場景
什麼,居然有人說沒什麼用?oa系統總有用到的時候。
本作品採用《CC 協議》,轉載必須註明作者和本文連結