用 Python 實現詞法分析器(Lexical Analyzer)
from __future__ import print_function
import sys
# Printable token names, indexed by token code.
# NOTE: this list and the tk_* unpacking below must remain in the same order.
all_syms = [
    "End_of_input",
    "Op_multiply", "Op_divide", "Op_mod", "Op_add", "Op_subtract",
    "Op_negate", "Op_not",
    "Op_less", "Op_lessequal", "Op_greater", "Op_greaterequal",
    "Op_equal", "Op_notequal", "Op_assign", "Op_and", "Op_or",
    "Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc",
    "LeftParen", "RightParen", "LeftBrace", "RightBrace",
    "Semicolon", "Comma",
    "Identifier", "Integer", "String",
]

# Token codes: consecutive integers starting at 0, one per entry of all_syms.
(tk_EOI,
 tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub,
 tk_Negate, tk_Not,
 tk_Lss, tk_Leq, tk_Gtr, tk_Geq,
 tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or,
 tk_If, tk_Else, tk_While, tk_Print, tk_Putc,
 tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace,
 tk_Semi, tk_Comma,
 tk_Ident, tk_Integer, tk_String) = range(len(all_syms))

# single character only symbols
symbols = {
    '{': tk_Lbrace,
    '}': tk_Rbrace,
    '(': tk_Lparen,
    ')': tk_Rparen,
    '+': tk_Add,
    '-': tk_Sub,
    '*': tk_Mul,
    '%': tk_Mod,
    ';': tk_Semi,
    ',': tk_Comma,
}

# reserved words and the token each one maps to
key_words = {
    'if': tk_If,
    'else': tk_Else,
    'print': tk_Print,
    'putc': tk_Putc,
    'while': tk_While,
}

# Scanner state shared by the functions below.
the_ch = " "       # dummy first char - but it must be a space
the_col = 0        # column of the_ch on the current line
the_line = 1       # current line number
input_file = None  # assigned by the main driver before scanning starts
#*** show error and exit
def error(line, col, msg):
    """Print a diagnostic and terminate the program with exit status 1.

    line, col -- position to report
    msg       -- text of the diagnostic

    Raises SystemExit; never returns.
    """
    print(line, col, msg)
    # sys.exit rather than the bare exit(): the latter is injected by the
    # site module for interactive sessions and is not guaranteed to exist.
    sys.exit(1)
#*** get the next character from the input
def next_ch():
    """Read one character from input_file into the_ch and return it.

    Updates the_line / the_col as a side effect: a newline bumps the line
    counter and resets the column to 0, any other read (including the empty
    string returned at end of input) advances the column by one.
    """
    global the_ch, the_col, the_line

    the_ch = input_file.read(1)
    if the_ch == '\n':
        the_line += 1
        the_col = 0
    else:
        the_col += 1
    return the_ch
#*** 'x' - character constants
def char_lit(err_line, err_col):
    """Scan a character constant; the_ch is the opening quote on entry.

    Recognises a single character or one of the escapes \\n and \\\\.

    Returns (tk_Integer, err_line, err_col, code) where code is the ordinal
    value of the character. Calls error() (which exits) on an empty constant,
    an unknown escape, a multi-character constant, or end of input inside
    the constant.
    """
    next_ch()  # skip opening quote
    if len(the_ch) == 0:
        # ord('') would raise TypeError; report it as a lexer error instead
        error(err_line, err_col, "EOF in character constant")
    n = ord(the_ch)
    if the_ch == '\'':
        error(err_line, err_col, "empty character constant")
    elif the_ch == '\\':
        next_ch()
        if the_ch == 'n':
            n = 10
        elif the_ch == '\\':
            n = ord('\\')
        else:
            # %s instead of %c: the_ch is the empty string at end of input,
            # which %c rejects with a TypeError
            error(err_line, err_col, "unknown escape sequence \\%s" % (the_ch))
    if next_ch() != '\'':
        error(err_line, err_col, "multi-character constant")
    next_ch()  # move past the closing quote
    return tk_Integer, err_line, err_col, n
#*** process divide or comments
def div_or_cmt(err_line, err_col):
    """Handle a '/' : either the divide operator or the start of a comment.

    the_ch is '/' on entry. If the next character is not '*', returns
    (tk_Div, err_line, err_col). Otherwise skips up to and including the
    closing '*/' and returns whatever gettok() yields for the token that
    follows. Calls error() (which exits) if input ends inside the comment.
    """
    if next_ch() != '*':
        return tk_Div, err_line, err_col

    # comment found: step past the '*' and hunt for the terminator
    next_ch()
    while True:
        if not the_ch:
            error(err_line, err_col, "EOF in comment")
        elif the_ch != '*':
            next_ch()
        elif next_ch() == '/':
            # consumed "*/" - resume normal scanning after it
            next_ch()
            return gettok()
#*** "string"
def string_lit(start, err_line, err_col):
    """Scan a string literal delimited by the character `start`.

    the_ch is the opening delimiter on entry. Returns
    (tk_String, err_line, err_col, text) where text is everything between
    the delimiters. Calls error() (which exits) if a newline or end of input
    is reached before the closing delimiter.
    """
    pieces = []
    while True:
        ch = next_ch()
        if ch == start:
            break
        if not ch:
            error(err_line, err_col, "EOF while scanning string literal")
        if ch == '\n':
            error(err_line, err_col, "EOL while scanning string literal")
        pieces.append(ch)
    next_ch()  # move past the closing delimiter
    return tk_String, err_line, err_col, "".join(pieces)
#*** handle identifiers and integers
def ident_or_int(err_line, err_col):
    """Scan a run of alphanumerics/underscores as keyword, identifier or integer.

    Returns one of:
      (tk_Integer, err_line, err_col, value)  -- run made of digits only
      (<keyword token>, err_line, err_col)    -- run found in key_words
      (tk_Ident, err_line, err_col, text)     -- anything else

    Calls error() (which exits) if the_ch cannot start such a run, or if the
    run starts with a digit but is not a valid integer.
    """
    is_number = True
    text = ""
    while the_ch.isalnum() or the_ch == '_':
        text += the_ch
        if not the_ch.isdigit():
            is_number = False
        next_ch()
    if len(text) == 0:
        error(err_line, err_col, "ident_or_int: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
    if text[0].isdigit():
        if not is_number:
            error(err_line, err_col, "invalid number: %s" % (text))
        try:
            n = int(text)
        except ValueError:
            # isdigit() accepts some Unicode digits (e.g. superscripts) that
            # int() rejects; report them like any other malformed number
            error(err_line, err_col, "invalid number: %s" % (text))
        return tk_Integer, err_line, err_col, n
    if text in key_words:
        return key_words[text], err_line, err_col
    return tk_Ident, err_line, err_col, text
#*** look ahead for '>=', etc.
def follow(expect, ifyes, ifno, err_line, err_col):
    """Decide between a one- and a two-character operator.

    expect -- character that completes the two-character operator
    ifyes  -- token returned when the next character is `expect`
    ifno   -- token returned otherwise; tk_EOI here means the single
              character is not a valid token on its own

    Returns (token, err_line, err_col). Calls error() (which exits) when
    ifno is tk_EOI and the next character is not `expect`.
    """
    if next_ch() == expect:
        next_ch()
        return ifyes, err_line, err_col
    if ifno == tk_EOI:
        if not the_ch:
            # end of input: ord('') would raise TypeError below
            error(err_line, err_col, "follow: unexpected end of input")
        error(err_line, err_col, "follow: unrecognized character: (%d) '%c'" % (ord(the_ch), the_ch))
    return ifno, err_line, err_col
#*** return the next token type
def gettok():
    """Skip whitespace and return the next token.

    The result is a tuple (token, line, col) or (token, line, col, value),
    where line/col are the_line/the_col at the token's first character.
    The scanning itself is delegated to the helper matching that character.
    """
    while the_ch.isspace():
        next_ch()

    err_line, err_col = the_line, the_col
    ch = the_ch

    if not ch:
        return tk_EOI, err_line, err_col
    if ch == '/':
        return div_or_cmt(err_line, err_col)
    if ch == '\'':
        return char_lit(err_line, err_col)
    if ch == '"':
        return string_lit(ch, err_line, err_col)

    # first char -> (second char, token if it follows, token if it does not);
    # tk_EOI as the fallback tells follow() the lone character is invalid
    two_char_ops = {
        '<': ('=', tk_Leq, tk_Lss),
        '>': ('=', tk_Geq, tk_Gtr),
        '=': ('=', tk_Eq, tk_Assign),
        '!': ('=', tk_Neq, tk_Not),
        '&': ('&', tk_And, tk_EOI),
        '|': ('|', tk_Or, tk_EOI),
    }
    if ch in two_char_ops:
        expect, ifyes, ifno = two_char_ops[ch]
        return follow(expect, ifyes, ifno, err_line, err_col)

    if ch in symbols:
        sym = symbols[ch]
        next_ch()
        return sym, err_line, err_col

    return ident_or_int(err_line, err_col)
#*** main driver
# Read from the file named by the first command-line argument, or from
# standard input when no argument is given.
input_file = sys.stdin
if len(sys.argv) > 1:
    try:
        input_file = open(sys.argv[1], "r", 4096)
    except IOError:
        error(0, 0, "Can't open %s" % sys.argv[1])

try:
    # Print one line per token (line, column, name and, where present, the
    # value) until End_of_input has been printed.
    while True:
        t = gettok()
        tok = t[0]
        line = t[1]
        col = t[2]
        print("%5d %5d %-14s" % (line, col, all_syms[tok]), end='')
        if tok == tk_Integer:
            print(" %5d" % (t[3]))
        elif tok == tk_Ident:
            print(" %s" % (t[3]))
        elif tok == tk_String:
            print(' "%s"' % (t[3]))
        else:
            print("")
        if tok == tk_EOI:
            break
finally:
    # Release the file we opened ourselves; stdin is left alone.
    if input_file is not sys.stdin:
        input_file.close()
輸出(測試用例三)
5 16 Keyword_print
5 40 Op_subtract
6 16 Keyword_putc
6 40 Op_less
7 16 Keyword_if
7 40 Op_greater
8 16 Keyword_else
8 40 Op_lessequal
9 16 Keyword_while
9 40 Op_greaterequal
10 16 LeftBrace
10 40 Op_equal
11 16 RightBrace
11 40 Op_notequal
12 16 LeftParen
12 40 Op_and
13 16 RightParen
13 40 Op_or
14 16 Op_subtract
14 40 Semicolon
15 16 Op_not
15 40 Comma
16 16 Op_multiply
16 40 Op_assign
17 16 Op_divide
17 40 Integer 42
18 16 Op_mod
18 40 String "String literal"
19 16 Op_add
19 40 Identifier variable_name
20 26 Integer 10
21 26 Integer 92
22 26 Integer 32
23 1 End_of_input
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69945560/viewspace-2669069/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- 實現指令碼直譯器 - 詞法分析器指令碼詞法分析
- 詞法分析器詞法分析
- Lex詞法分析器詞法分析
- UVA12421 (Jiandan) Mua (I) - Lexical Analyzer題解
- Monkey 01 lexer 詞法分析器詞法分析
- 編譯器前端之如何實現基於DFA的詞法分析器編譯前端詞法分析
- 【編譯原理】手工打造詞法分析器編譯原理詞法分析
- Hanlp自然語言處理工具之詞法分析器HanLP自然語言處理詞法分析
- 【水汐の編譯原理】 詞法分析器 課題1編譯原理詞法分析
- [譯]用javascript實現一門程式語言-詞法分析JavaScript詞法分析
- 用切片操作實現的Python篩法Python
- python實現詞頻統計Python
- python 實現中文分詞統計Python中文分詞
- 編譯器實現之旅——第五章 實現語法分析器前的準備編譯語法分析
- 筆記六:通過 Analyzer 進行分詞筆記分詞
- 教你如何實現Python 過濾敏感詞Python
- 漢語言處理包HanLPv1.6.0釋出,感知機詞法分析器HanLP詞法分析
- 用python實現基於凝固度和自由度的新詞發現程式Python
- python使用jieba實現中文文件分詞和去停用詞PythonJieba分詞
- ES 筆記六:通過 Analyzer 進行分詞筆記分詞
- python實現簡單猜單詞遊戲Python遊戲
- Java 實現《編譯原理》簡單詞法分析功能Java編譯原理詞法分析
- 使用PHP實現詞法分析與自定義語言PHP詞法分析
- 依存句法分析器的簡單實現
- ElasticSearch7.3 學習之定製分詞器(Analyzer)Elasticsearch分詞
- ElasticSearch7.3學習(十五)----中文分詞器(IK Analyzer)及自定義詞庫Elasticsearch中文分詞
- 用WordCloud詞雲+LDA主題模型,帶你讀一讀《芳華》(python實現)CloudLDA模型Python
- 用laravel框架實現敏感詞彙過濾功能Laravel框架
- 【python】百度關鍵詞排名查詢實現Python
- 【數值方法-Python實現】Crout分解+追趕法實現Python
- 實踐003-elasticsearch之analyzerElasticsearch
- jvm-44-jvm 記憶體效能分析工具 Eclipse Memory Analyzer Tool (MAT) / 記憶體分析器 (MAT)JVM記憶體Eclipse
- this詞法
- 【編譯原理】手工打造語法分析器編譯原理語法分析
- elasticsearch + python 如何使用非全域性的 analyzerElasticsearchPython
- memray: Python的記憶體分析器Python記憶體
- 使用 FastText 實現詞嵌入AST
- 用Java寫編譯器(1)- 詞法和語法分析Java編譯語法分析