手把手带你从零开始实现一个编译器
https://www.cnblogs.com/abinzhao/p/18748462
其实我之前写过关于编译器方面的文章,昨天写了一篇关于通过自制适合自己的JavaScript语法的文章,但是被某个掘友说不懂编译,误人子弟,本来我不想理会这种事儿,可实在是气不过,我又不是他爸妈,凭什么惯着他,我最讨厌的就是这种在现实中过的不如意,网上找存在感的人。
当然以上都不是主要原因,主要还是掘友杨永安提醒了我,确实有一部分刚入门的兄弟,可能对于编译器还是不大了解,因此专门出了这一篇文章,主要介绍一下关于编译器的原理以及如何一步步实现编译器,当然写的肯定不如大神们,毕竟我也是菜鸟,要是有大神看到有问题的地方请指正,我会虚心接受,并虚心请教学习的。
编译器的基础知识介绍
编译器是将高级语言转换为机器语言的程序。编译器的基本流程包括词法分析、语法分析、语义检查以及目标代码生成,其本质是在不同的抽象层级之间进行翻译。在编译器的实现中,我们需要掌握以下知识点:
- 语言语法设计和分析
- 有限状态自动机和正则表达式
- 上下文无关文法和语法树
- 符号表和类型检测
- 中间代码和目标代码生成
语言语法的分析和设计
在实现编译器之前,我们需要先设计一种编程语言,并确定其语法规则。本文中我们将使用简单的类C语言来作为编译器的目标语言,其语法规则定义如下:
<program> ::= <declaration-list>
<declaration-list> ::= <declaration> | <declaration-list> <declaration>
<declaration> ::= <type> <identifier-list> ;
<type> ::= int | float | char | double
<identifier-list> ::= <identifier> | <identifier-list> , <identifier>
<identifier> ::= <letter> | <identifier> <letter> | <identifier> <digit>
<letter> ::= a | b | ... | z | A | B | ... | Z | _
<digit> ::= 0 | 1 | ... | 9
该语言支持四种类型:int、float、char、double,并且支持以分号结束的多个声明。
词法分析器的实现
词法分析器是编译器的一个重要模块,用于将输入的程序代码分解成一系列有意义的单词,称为Token。Token 可以是关键字、标识符、运算符、分隔符、常量等。
我们使用正则表达式来描述程序代码中的各种单词模式。在本文中,我们将基于有限状态自动机实现一个简单的词法分析器,其代码如下:
# coding=utf-8 | |
import enum | |
class TokenType(enum.Enum):
    """Categories of tokens produced by the lexer and consumed by the parser."""
    # keywords
    INT = 0
    FLOAT = 1
    CHAR = 2
    DOUBLE = 3
    # identifiers
    IDENTIFIER = 4
    # operators
    PLUS = 5
    MINUS = 6
    TIMES = 7
    DIVIDE = 8
    # delimiters
    COMMA = 9
    SEMI = 10
    # constants
    INT_CONST = 11
    FLOAT_CONST = 12
    CHAR_CONST = 13
    DOUBLE_CONST = 14
    # Added: members the lexer (EOF) and parser (ASSIGN, LPAREN, RPAREN)
    # reference but that were missing from the original enum.
    ASSIGN = 15
    LPAREN = 16
    RPAREN = 17
    EOF = 18
class Token:
    """A lexical token: a TokenType category paired with its literal value."""

    def __init__(self, token_type, value):
        self.token_type = token_type  # a TokenType member
        self.value = value            # the lexeme (str, int, or float)

    def __repr__(self):
        # Added for debuggability; purely additive, no behavior change.
        return f"Token({self.token_type}, {self.value!r})"
class Lexer:
    """Hand-written scanner: turns the source text into a stream of Tokens."""

    def __init__(self, text):
        self.tokens = []
        self.text = text
        self.pos = 0
        # Fixed: guard against empty input (the original indexed text[0]
        # unconditionally and crashed on "").
        self.current_char = self.text[self.pos] if self.text else None

    def error(self):
        raise Exception("Invalid character")

    def advance(self):
        """Move to the next character; current_char becomes None at end of input."""
        self.pos += 1
        if self.pos > len(self.text) - 1:
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def skip_whitespace(self):
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def integer(self):
        """Consume a run of digits and return it as an int."""
        result = ""
        while self.current_char is not None and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        return int(result)

    def float_number(self):
        # Fixed: this def was fused onto the previous `return` line in the
        # original (a syntax error).
        # NOTE(review): unused by get_next_token, and it accepts multiple
        # '.' characters ("1.2.3" raises ValueError) — confirm whether it
        # should be removed or hardened.
        result = ""
        while self.current_char is not None and (self.current_char.isdigit() or self.current_char == "."):
            result += self.current_char
            self.advance()
        return float(result)

    def string(self):
        """Consume a double-quoted string literal and return its contents."""
        result = ""
        self.advance()  # skip the opening quote
        while self.current_char is not None and self.current_char != '"':
            result += self.current_char
            self.advance()
        self.advance()  # skip the closing quote
        return result

    def get_next_token(self):
        """Return the next Token, or an EOF token when input is exhausted."""
        while self.current_char is not None:
            if self.current_char.isspace():
                self.skip_whitespace()
                continue
            if self.current_char == ",":
                self.advance()
                return Token(TokenType.COMMA, ",")
            if self.current_char == ";":
                self.advance()
                return Token(TokenType.SEMI, ";")
            if self.current_char.isalpha() or self.current_char == "_":
                # Identifier or keyword: [A-Za-z_][A-Za-z0-9_]*
                result = ""
                while self.current_char is not None and (self.current_char.isalnum() or self.current_char == "_"):
                    result += self.current_char
                    self.advance()
                if result == "int":
                    return Token(TokenType.INT, result)
                elif result == "float":
                    return Token(TokenType.FLOAT, result)
                elif result == "char":
                    return Token(TokenType.CHAR, result)
                elif result == "double":
                    return Token(TokenType.DOUBLE, result)
                else:
                    return Token(TokenType.IDENTIFIER, result)
            if self.current_char.isdigit():
                # Integer literal, promoted to float if a '.' follows.
                result = self.integer()
                if self.current_char == ".":
                    self.advance()
                    result = float(str(result) + "." + str(self.integer()))
                    return Token(TokenType.FLOAT_CONST, result)
                else:
                    return Token(TokenType.INT_CONST, result)
            if self.current_char == '"':
                # NOTE(review): string literals are tagged CHAR_CONST — confirm.
                return Token(TokenType.CHAR_CONST, self.string())
            if self.current_char == "+":
                self.advance()
                return Token(TokenType.PLUS, "+")
            if self.current_char == "-":
                self.advance()
                return Token(TokenType.MINUS, "-")
            if self.current_char == "*":
                self.advance()
                return Token(TokenType.TIMES, "*")
            if self.current_char == "/":
                self.advance()
                return Token(TokenType.DIVIDE, "/")
            self.error()
        # NOTE(review): relies on TokenType.EOF being defined in the enum.
        return Token(TokenType.EOF, None)
语法分析器的实现
在词法分析器的基础上,我们进一步构建语法分析器。语法分析器的作用是将 Token 序列解析成语法树。在实现语法分析器时,我们可以利用上下文无关文法和递归下降分析的方法。在本文中,我们将按照一定的优先级处理不同类型的运算符,并将其转换为对应的语法树节点。其代码如下:
class AST:
    """Base class for all abstract-syntax-tree node types."""
    pass
class BinOp(AST):
    """Binary operation node: applies operator `op` to `left` and `right`."""

    def __init__(self, left, op, right):
        self.left = left
        self.right = right
        # The operator token serves both as `op` and as this node's token.
        self.op = op
        self.token = op
class Num(AST):
    """Numeric literal node; caches the token's int/float value."""
    def __init__(self, token):
        self.token = token
        self.value = token.value
class String(AST):
    """String literal node; caches the token's text value."""
    def __init__(self, token):
        self.token = token
        self.value = token.value
class UnaryOp(AST):
    """Unary operation node: applies a prefix operator (e.g. +/-) to `expr`."""

    def __init__(self, op, expr):
        self.expr = expr
        # The operator token serves both as `op` and as this node's token.
        self.op = op
        self.token = op
class VarDecl(AST):
    """Variable declaration node: a type paired with a declared name.

    NOTE(review): Parser.statement() passes a *list* of Var nodes as
    var_name, while SemanticAnalyzer.visit_VarDecl reads var_name.value
    as a single node — confirm which shape is intended.
    """
    def __init__(self, var_type, var_name):
        self.var_type = var_type
        self.var_name = var_name
class Var(AST):
    """Variable reference node; caches the identifier token's name."""
    def __init__(self, token):
        self.token = token
        self.value = token.value
class Assign(AST):
    """Assignment node: stores the value of `right` into the variable `left`."""

    def __init__(self, left, op, right):
        self.right = right
        self.left = left
        # The '=' token serves both as `op` and as this node's token.
        self.op = op
        self.token = op
class Compound(AST):
    """Container node holding an ordered list of child statements."""
    def __init__(self):
        self.children = []
class Parser:
    """Recursive-descent parser: turns the lexer's token stream into an AST.

    Expression grammar implemented here:
        expr   : term ((PLUS | MINUS) term)*
        term   : factor ((TIMES | DIVIDE) factor)*
        factor : (PLUS | MINUS) factor | constant | LPAREN expr RPAREN | variable
    """
    def __init__(self, lexer):
        self.lexer = lexer
        # One-token lookahead.
        self.current_token = self.lexer.get_next_token()
    def error(self):
        raise Exception("Invalid syntax")
    def eat(self, token_type):
        # Consume the current token if it matches the expected type;
        # otherwise report a syntax error.
        if self.current_token.token_type == token_type:
            self.current_token = self.lexer.get_next_token()
        else:
            self.error()
    def factor(self):
        """factor : unary +/- | constant | parenthesized expr | variable"""
        token = self.current_token
        if token.token_type == TokenType.PLUS:
            self.eat(TokenType.PLUS)
            node = UnaryOp(token, self.factor())
            return node
        elif token.token_type == TokenType.MINUS:
            self.eat(TokenType.MINUS)
            node = UnaryOp(token, self.factor())
            return node
        elif token.token_type == TokenType.INT_CONST:
            self.eat(TokenType.INT_CONST)
            return Num(token)
        elif token.token_type == TokenType.FLOAT_CONST:
            self.eat(TokenType.FLOAT_CONST)
            return Num(token)
        elif token.token_type == TokenType.CHAR_CONST:
            self.eat(TokenType.CHAR_CONST)
            return String(token)
        elif token.token_type == TokenType.LPAREN:
            # NOTE(review): TokenType.LPAREN/RPAREN are not defined in the
            # TokenType enum as shown, and the lexer never emits '(' or ')'
            # tokens — confirm.
            self.eat(TokenType.LPAREN)
            node = self.expr()
            self.eat(TokenType.RPAREN)
            return node
        else:
            return self.variable()
    def term(self):
        """term : factor ((TIMES | DIVIDE) factor)*"""
        node = self.factor()
        while self.current_token.token_type in (TokenType.TIMES, TokenType.DIVIDE):
            token = self.current_token
            if token.token_type == TokenType.TIMES:
                self.eat(TokenType.TIMES)
            elif token.token_type == TokenType.DIVIDE:
                self.eat(TokenType.DIVIDE)
            node = BinOp(left=node, op=token, right=self.factor())
        return node
    def expr(self):
        """expr : term ((PLUS | MINUS) term)*"""
        node = self.term()
        while self.current_token.token_type in (TokenType.PLUS, TokenType.MINUS):
            token = self.current_token
            if token.token_type == TokenType.PLUS:
                self.eat(TokenType.PLUS)
            elif token.token_type == TokenType.MINUS:
                self.eat(TokenType.MINUS)
            node = BinOp(left=node, op=token, right=self.term())
        return node
    def variable(self):
        """variable : IDENTIFIER"""
        var_token = self.current_token
        self.eat(TokenType.IDENTIFIER)
        return Var(var_token)
    def var_declaration(self):
        """var_declaration : INT identifier (COMMA identifier)* SEMI

        Returns a two-element list: [type marker VarDecl, list of Var names].
        NOTE(review): only `int` is accepted here although the grammar also
        lists float/char/double — confirm intended.
        """
        self.eat(TokenType.INT)
        var_type = VarDecl(TokenType.INT, None)
        var_names = [Var(self.current_token)]
        self.eat(TokenType.IDENTIFIER)
        while self.current_token.token_type == TokenType.COMMA:
            self.eat(TokenType.COMMA)
            var_names.append(Var(self.current_token))
            self.eat(TokenType.IDENTIFIER)
        self.eat(TokenType.SEMI)
        return [var_type, var_names]
    def statement(self):
        """statement : declaration list (starting at INT) | assignment

        NOTE(review): the two branches return different shapes (a list of
        VarDecl vs a single Assign node), yet parse() always calls
        children.extend() on the result — confirm.
        """
        if self.current_token.token_type == TokenType.INT:
            decl_list = []
            var_decl = self.var_declaration()
            decl_list.append(VarDecl(var_decl[0], var_decl[1]))
            while self.current_token.token_type == TokenType.IDENTIFIER:
                var_decl = self.var_declaration()
                decl_list.append(VarDecl(var_decl[0], var_decl[1]))
            return decl_list
        else:
            # NOTE(review): TokenType.ASSIGN is missing from the enum as
            # shown, and the lexer never tokenizes '=' — confirm.
            left = self.variable()
            token = self.current_token
            self.eat(TokenType.ASSIGN)
            right = self.expr()
            node = Assign(left=left, op=token, right=right)
            return node
    def parse(self):
        """Parse the whole token stream into a Compound root node.

        NOTE(review): only IDENTIFIER-led input reaches statement(); an
        INT-led declaration falls through to expr() and fails. Also relies
        on TokenType.EOF, which is missing from the enum as shown — confirm.
        """
        node = Compound()
        while self.current_token.token_type != TokenType.EOF:
            if self.current_token.token_type == TokenType.SEMI:
                self.eat(TokenType.SEMI)
            elif self.current_token.token_type == TokenType.IDENTIFIER:
                node.children.extend(self.statement())
            else:
                node.children.append(self.expr())
        return node
语义检查器的实现
语义检查器是编译器的另一个重要模块,用于检查程序代码在语义上是否正确。在实现语义检查器时,我们需要进行类型检查、符号引用和作用域判断等操作,以保证程序在执行过程中的正确性。在本文中,我们将利用语法树对程序进行语义检查,并在发现错误时立即抛出异常。其代码如下:
class NodeVisitor:
    """Base class implementing visitor-pattern dispatch over AST nodes."""

    def visit(self, node):
        """Dispatch to visit_<ClassName>, falling back to generic_visit."""
        handler = getattr(self, f"visit_{type(node).__name__}", self.generic_visit)
        return handler(node)

    def generic_visit(self, node):
        """Fallback for node types without a dedicated visit_* handler."""
        print(f"No visit_{type(node).__name__} method implemented")
class Symbol:
    """A named symbol-table entry, optionally carrying a declared type."""
    def __init__(self, name, type=None):
        self.name = name
        self.type = type
class ScopedSymbolTable:
    """A symbol table chained to its enclosing (outer) scope for lookups."""

    def __init__(self, scope_name, scope_level, enclosing_scope=None):
        self._symbols = {}                    # name -> Symbol
        self.scope_name = scope_name
        self.scope_level = scope_level        # 1 = global, deeper = nested
        self.enclosing_scope = enclosing_scope

    def __str__(self):
        """Human-readable dump of the scope headers and its contents."""
        h1 = "SCOPE (SCOPED SYMBOL TABLE)"
        lines = ["\n", h1, "-" * len(h1)]
        for header_name, header_value in (
            ("Scope name", self.scope_name),
            ("Scope level", self.scope_level),
            ("Enclosing scope", self.enclosing_scope.scope_name if self.enclosing_scope else None),
        ):
            lines.append(f"{header_name:<15}: {header_value}")
        # Fixed: this assignment was fused onto the append line above in the
        # original (a syntax error).
        h2 = "Scope (Scoped symbol table) contents"
        lines.extend([h2, "-" * len(h2)])
        lines.extend(f"{key:>7}: {value}" for key, value in self._symbols.items())
        lines.append("\n")
        return "\n".join(lines)

    def insert(self, symbol):
        """Record *symbol* in this scope, keyed by its name."""
        self._symbols[symbol.name] = symbol

    def lookup(self, name, current_scope_only=False):
        """Resolve *name* here, then (unless restricted) in enclosing scopes.

        Returns the Symbol, or None if it is not declared anywhere visible.
        """
        symbol = self._symbols.get(name)
        if symbol is not None:
            return symbol
        if current_scope_only:
            return None
        if self.enclosing_scope is not None:
            return self.enclosing_scope.lookup(name)
        return None
class SemanticAnalyzer(NodeVisitor):
    """Semantic pass: resolves names via scoped symbol tables and checks types.

    Each visited expression node is annotated with a `.type` attribute
    (a TokenType member) used for mismatch detection.
    """

    def __init__(self):
        self.current_scope = None  # innermost ScopedSymbolTable, set in visit_Program

    def visit_Compound(self, node):
        for child in node.children:
            self.visit(child)

    def visit_BinOp(self, node):
        self.visit(node.left)
        self.visit(node.right)
        if node.op.token_type in (TokenType.PLUS, TokenType.MINUS, TokenType.TIMES, TokenType.DIVIDE):
            # Arithmetic requires matching operand types; the result keeps them.
            if node.left.type != node.right.type:
                raise Exception("Type mismatch in binary operator")
            node.type = node.left.type
        elif node.op.token_type == TokenType.ASSIGN:
            # NOTE(review): the parser builds assignments as Assign nodes, so
            # this branch looks unreachable — confirm.
            if not isinstance(node.left, Var):
                raise Exception(f"{node.left} is not assignable")
            if node.left.type != node.right.type:
                raise Exception("Type mismatch in assignment")

    def visit_Num(self, node):
        # Fixed: float literals were always typed INT; classify by token kind.
        if node.token.token_type == TokenType.FLOAT_CONST:
            node.type = TokenType.FLOAT
        else:
            node.type = TokenType.INT

    def visit_String(self, node):
        node.type = TokenType.CHAR

    def visit_VarDecl(self, node):
        # Reject redeclaration within the same scope, then record the symbol.
        var_name = node.var_name.value
        if self.current_scope.lookup(var_name, current_scope_only=True):
            raise Exception(f"{var_name} already declared in current scope")
        self.current_scope.insert(Symbol(var_name, node.var_type))

    def visit_Var(self, node):
        var_name = node.value
        symbol = self.current_scope.lookup(var_name)
        if symbol is None:
            raise Exception(f"{var_name} is not defined")
        # Propagate the declared type onto the reference node.
        node.type = symbol.type

    def visit_Assign(self, node):
        self.visit(node.left)
        self.visit(node.right)

    def visit_Program(self, node):
        # Enter the global scope, analyze the program body, then restore.
        global_scope = ScopedSymbolTable(scope_name='global', scope_level=1, enclosing_scope=self.current_scope)
        self.current_scope = global_scope
        self.visit(node.block)
        self.current_scope = self.current_scope.enclosing_scope
目标代码生成器的实现
目标代码生成器是编译器的最后一步,其目的是将语法树转换为目标机器的汇编代码。在实现目标代码生成器时,我们需要根据目标机器的指令集和内存结构生成相应的汇编指令,以完成对程序的最终翻译。在本文中,我们将以类C语言作为源语言,将其翻译为32位x86汇编代码,并以x86处理器作为目标机器。
由于目标机器的处理能力限制,在实现代码生成器时,我们需要做到尽可能的优化,以达到更好的程序性能。在本文中,我们将实现一些简单的代码优化技术,例如常量折叠和寄存器分配。其代码如下:
class X86CodeGenerator(NodeVisitor):
    """Walks the AST and emits 32-bit x86 assembly (Intel syntax).

    Expressions compile stack-machine style: operands are pushed, binary
    operators pop two values and push the result.
    """

    def __init__(self):
        self.code = []                                # .text instructions
        self.label_count = 0                          # counter for new_label()
        self.data_section = [".data\n"]
        self.rodata_section = [".section .rodata\n"]  # string literals
        self.bss_section = [".bss\n"]
        # NOTE(review): must be set to a populated ScopedSymbolTable by the
        # driver before visiting — nothing in this class assigns it.
        self.current_scope = None

    def new_label(self):
        """Return a fresh unique code label: L0, L1, ..."""
        self.label_count += 1
        return f"L{self.label_count - 1}"

    def emit(self, code):
        self.code.append(code)

    def emit_data(self, data):
        self.data_section.append(data)

    def emit_rodata(self, data):
        self.rodata_section.append(data)

    def emit_bss(self, data):
        self.bss_section.append(data)

    def visit_Compound(self, node):
        for child in node.children:
            self.visit(child)

    def visit_BinOp(self, node):
        # Left is pushed first (deeper on the stack), so the right operand
        # pops into ecx and the left into eax.
        self.visit(node.left)
        self.visit(node.right)
        if node.op.token_type == TokenType.PLUS:
            self.emit("pop ecx")
            self.emit("pop eax")
            self.emit("add eax, ecx")
            self.emit("push eax")
        elif node.op.token_type == TokenType.MINUS:
            self.emit("pop ecx")
            self.emit("pop eax")
            self.emit("sub eax, ecx")
            self.emit("push eax")
        elif node.op.token_type == TokenType.TIMES:
            self.emit("pop ecx")
            self.emit("pop eax")
            self.emit("imul ecx")
            self.emit("push eax")
        elif node.op.token_type == TokenType.DIVIDE:
            self.emit("pop ecx")
            self.emit("pop eax")
            # Fixed: the language's ints are signed, so sign-extend eax into
            # edx:eax and use idiv (was `mov edx, 0` + unsigned `div`, which
            # is wrong for negative dividends).
            self.emit("cdq")
            self.emit("idiv ecx")
            self.emit("push eax")

    def visit_Num(self, node):
        self.emit(f"push {node.value}")

    def visit_String(self, node):
        # Place the literal in .rodata and push its address.
        data_label = f".LC{len(self.rodata_section)}"
        self.emit(f"lea eax, dword ptr [{data_label}]")
        self.emit("push eax")
        self.emit_rodata(f"{data_label}: db \"{node.value}\",0")

    def visit_VarDecl(self, node):
        # Storage allocation is handled elsewhere; nothing to emit here.
        pass

    def visit_Var(self, node):
        symbol = self.current_scope.lookup(node.value)
        # Symbols whose name embeds "ebp-" are treated as stack locals.
        # NOTE(review): the local branch pushes the *address* (lea) while the
        # global branch pushes the *value* — confirm this asymmetry.
        if "ebp-" in symbol.name:
            self.emit(f"lea eax, dword ptr [{symbol.name}]")
            self.emit("push eax")
        else:
            self.emit(f"push {symbol.name}")

    def visit_Assign(self, node):
        self.visit(node.right)
        symbol = self.current_scope.lookup(node.left.value)
        # Fixed: the right-hand value must be popped in BOTH cases; the
        # original only popped for stack locals, leaving the stack unbalanced
        # and storing a stale eax for globals.
        self.emit("pop eax")
        if "ebp-" in symbol.name:
            self.emit(f"mov dword ptr [{symbol.name}], eax")
        else:
            self.emit(f"mov {symbol.name}, eax")

    def visit_Program(self, node):
        self.emit(".intel_syntax noprefix")
        self.emit(".text")
        self.emit(".globl _start")
        self.emit("_start:")
        self.visit(node.block)
        # exit(0) via the Linux int 0x80 syscall interface (eax=1 is sys_exit).
        self.emit("mov eax, 1")
        self.emit("xor ebx, ebx")
        self.emit("int 0x80")
        self.emit("\n")
        self.emit_data("".join(self.data_section))
        self.emit("\n")
        self.emit_rodata("".join(self.rodata_section))
        self.emit("\n")
        self.emit_bss("".join(self.bss_section))
总结
本文介绍了一个简单的编译器的实现过程,包括词法分析器、语法分析器、语义检查器和目标代码生成器。虽然这个编译器实现的语言很简单,但其实现过程可以帮助我们更好地理解编译器的工作原理,从而更好地实现更复杂的编程语言。
在实现编译器的过程中,我们发现编译器是一个功能非常强大的工具,其可以将高级语言翻译为底层机器语言,并帮助我们解决许多代码开发中的问题。同时,编译器的实现也是一个高度技术性的过程,需要对计算机系统、编程语言和编译原理等方面有深入的理解和掌握。
本文中的代码实现虽然比较简单,但其思路和实现方法可以帮助我们更好地理解编译器的工作原理。在实际的编程中,建议使用现有的编译器工具,避免自己重新造轮子。不过,对于想要深入了解编译原理的同学,可以尝试实现更复杂的编程语言编译器,以提升自己的编程技术水平。
本文来自博客园,作者:纯爱掌门人,转载请注明原文链接:https://www.cnblogs.com/abinzhao/p/18748462
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/908176.shtml
如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!