手把手带你从零开始实现一个编译器
https://www.cnblogs.com/abinzhao/p/18748462
其实我之前写过关于编译器方面的文章,昨天写了一篇关于通过自制适合自己的JavaScript语法的文章,但是被某个掘友说不懂编译,误人子弟,本来我不想理会这种事儿,可实在是气不过,我又不是他爸妈,凭什么惯着他,我最讨厌的就是这种在现实中过的不如意,网上找存在感的人。
当然以上都不是主要原因,主要还是掘友杨永安提醒了我,确实有一部分刚入门的兄弟,可能对于编译器还是不大了解,因此专门出了这一篇文章,主要介绍一下关于编译器的原理以及如何一步步实现编译器,当然写的肯定不如大神们,毕竟我也是菜鸟,要是有大神看到有问题的地方请指正,我会虚心接受,并虚心请教学习的。
编译器的基础知识介绍
编译器是将高级语言转换为机器语言的程序。编译器的基本流程包括词法分析、语法分析、语义检查以及目标代码生成,其本质是在不同的抽象层级之间进行翻译。在编译器的实现中,我们需要掌握以下知识点:
- 语言语法设计和分析
- 有限状态自动机和正则表达式
- 上下文无关文法和语法树
- 符号表和类型检测
- 中间代码和目标代码生成
语言语法的分析和设计
在实现编译器之前,我们需要先设计一种编程语言,并确定其语法规则。本文中我们将使用简单的类C语言来作为编译器的目标语言,其语法规则定义如下:
<program> ::= <declaration-list>
<declaration-list> ::= <declaration> | <declaration-list> <declaration>
<declaration> ::= <type> <identifier-list> ;
<type> ::= int | float | char | double
<identifier-list> ::= <identifier> | <identifier-list> , <identifier>
<identifier> ::= <letter> | <identifier> <letter> | <identifier> <digit>
<letter> ::= a | b | ... | z | A | B | ... | Z | _
<digit> ::= 0 | 1 | ... | 9
该语言支持四种类型:int、float、char、double,并且支持以分号结束的多个声明。
词法分析器的实现
词法分析器是编译器的一个重要模块,用于将输入的程序代码分解成一系列有意义的单词,称为Token。Token 可以是关键字、标识符、运算符、分隔符、常量等。
我们使用正则表达式来描述程序代码中的各种单词模式。在本文中,我们将基于有限状态自动机实现一个简单的词法分析器,其代码如下:
# coding=utf-8 | |
import enum | |
class TokenType(enum.Enum):
    """Categories of tokens produced by the lexer and consumed by the parser."""
    # keywords
    INT = 0
    FLOAT = 1
    CHAR = 2
    DOUBLE = 3
    # identifiers
    IDENTIFIER = 4
    # operators
    PLUS = 5
    MINUS = 6
    TIMES = 7
    DIVIDE = 8
    # delimiters
    COMMA = 9
    SEMI = 10
    # constants
    INT_CONST = 11
    FLOAT_CONST = 12
    CHAR_CONST = 13
    DOUBLE_CONST = 14
    # Added: members the lexer (EOF) and parser (ASSIGN, LPAREN, RPAREN)
    # reference but that were missing from the original enum.
    ASSIGN = 15
    LPAREN = 16
    RPAREN = 17
    EOF = 18
class Token:
    """A lexical token: a TokenType category paired with its literal value."""

    def __init__(self, token_type, value):
        self.token_type = token_type  # a TokenType member
        self.value = value            # the lexeme (str, int, or float)

    def __repr__(self):
        # Added for debuggability; purely additive, no behavior change.
        return f"Token({self.token_type}, {self.value!r})"
class Lexer:
    """Hand-written scanner: turns the source text into a stream of Tokens."""

    def __init__(self, text):
        self.tokens = []
        self.text = text
        self.pos = 0
        # Fixed: guard against empty input (the original indexed text[0]
        # unconditionally and crashed on "").
        self.current_char = self.text[self.pos] if self.text else None

    def error(self):
        raise Exception("Invalid character")

    def advance(self):
        """Move to the next character; current_char becomes None at end of input."""
        self.pos += 1
        if self.pos > len(self.text) - 1:
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def skip_whitespace(self):
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def integer(self):
        """Consume a run of digits and return it as an int."""
        result = ""
        while self.current_char is not None and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        return int(result)

    def float_number(self):
        # Fixed: this def was fused onto the previous `return` line in the
        # original (a syntax error).
        # NOTE(review): unused by get_next_token, and it accepts multiple
        # '.' characters ("1.2.3" raises ValueError) — confirm whether it
        # should be removed or hardened.
        result = ""
        while self.current_char is not None and (self.current_char.isdigit() or self.current_char == "."):
            result += self.current_char
            self.advance()
        return float(result)

    def string(self):
        """Consume a double-quoted string literal and return its contents."""
        result = ""
        self.advance()  # skip the opening quote
        while self.current_char is not None and self.current_char != '"':
            result += self.current_char
            self.advance()
        self.advance()  # skip the closing quote
        return result

    def get_next_token(self):
        """Return the next Token, or an EOF token when input is exhausted."""
        while self.current_char is not None:
            if self.current_char.isspace():
                self.skip_whitespace()
                continue
            if self.current_char == ",":
                self.advance()
                return Token(TokenType.COMMA, ",")
            if self.current_char == ";":
                self.advance()
                return Token(TokenType.SEMI, ";")
            if self.current_char.isalpha() or self.current_char == "_":
                # Identifier or keyword: [A-Za-z_][A-Za-z0-9_]*
                result = ""
                while self.current_char is not None and (self.current_char.isalnum() or self.current_char == "_"):
                    result += self.current_char
                    self.advance()
                if result == "int":
                    return Token(TokenType.INT, result)
                elif result == "float":
                    return Token(TokenType.FLOAT, result)
                elif result == "char":
                    return Token(TokenType.CHAR, result)
                elif result == "double":
                    return Token(TokenType.DOUBLE, result)
                else:
                    return Token(TokenType.IDENTIFIER, result)
            if self.current_char.isdigit():
                # Integer literal, promoted to float if a '.' follows.
                result = self.integer()
                if self.current_char == ".":
                    self.advance()
                    result = float(str(result) + "." + str(self.integer()))
                    return Token(TokenType.FLOAT_CONST, result)
                else:
                    return Token(TokenType.INT_CONST, result)
            if self.current_char == '"':
                # NOTE(review): string literals are tagged CHAR_CONST — confirm.
                return Token(TokenType.CHAR_CONST, self.string())
            if self.current_char == "+":
                self.advance()
                return Token(TokenType.PLUS, "+")
            if self.current_char == "-":
                self.advance()
                return Token(TokenType.MINUS, "-")
            if self.current_char == "*":
                self.advance()
                return Token(TokenType.TIMES, "*")
            if self.current_char == "/":
                self.advance()
                return Token(TokenType.DIVIDE, "/")
            self.error()
        # NOTE(review): relies on TokenType.EOF being defined in the enum.
        return Token(TokenType.EOF, None)
语法分析器的实现
在词法分析器的基础上,我们进一步构建语法分析器。语法分析器的作用是将 Token 序列解析成语法树。在实现语法分析器时,我们可以利用上下文无关文法和递归下降分析的方法。在本文中,我们将按照一定的优先级处理不同类型的运算符,并将其转换为对应的语法树节点。其代码如下:
class AST:
    """Base class for all abstract-syntax-tree node types."""
    pass
class BinOp(AST):
    """Binary operation node: applies operator `op` to `left` and `right`."""

    def __init__(self, left, op, right):
        self.left = left
        self.right = right
        # The operator token serves both as `op` and as this node's token.
        self.op = op
        self.token = op
class Num(AST):
    """Numeric literal node; caches the token's int/float value."""
    def __init__(self, token):
        self.token = token
        self.value = token.value
class String(AST):
    """String literal node; caches the token's text value."""
    def __init__(self, token):
        self.token = token
        self.value = token.value
class UnaryOp(AST):
    """Unary operation node: applies a prefix operator (e.g. +/-) to `expr`."""

    def __init__(self, op, expr):
        self.expr = expr
        # The operator token serves both as `op` and as this node's token.
        self.op = op
        self.token = op
class VarDecl(AST):
    """Variable declaration node: a type paired with a declared name.

    NOTE(review): Parser.statement() passes a *list* of Var nodes as
    var_name, while SemanticAnalyzer.visit_VarDecl reads var_name.value
    as a single node — confirm which shape is intended.
    """
    def __init__(self, var_type, var_name):
        self.var_type = var_type
        self.var_name = var_name
class Var(AST):
    """Variable reference node; caches the identifier token's name."""
    def __init__(self, token):
        self.token = token
        self.value = token.value
class Assign(AST):
    """Assignment node: stores the value of `right` into the variable `left`."""

    def __init__(self, left, op, right):
        self.right = right
        self.left = left
        # The '=' token serves both as `op` and as this node's token.
        self.op = op
        self.token = op
class Compound(AST):
    """Container node holding an ordered list of child statements."""
    def __init__(self):
        self.children = []
class Parser:
    """Recursive-descent parser: turns the lexer's token stream into an AST.

    Expression grammar implemented here:
        expr   : term ((PLUS | MINUS) term)*
        term   : factor ((TIMES | DIVIDE) factor)*
        factor : (PLUS | MINUS) factor | constant | LPAREN expr RPAREN | variable
    """
    def __init__(self, lexer):
        self.lexer = lexer
        # One-token lookahead.
        self.current_token = self.lexer.get_next_token()
    def error(self):
        raise Exception("Invalid syntax")
    def eat(self, token_type):
        # Consume the current token if it matches the expected type;
        # otherwise report a syntax error.
        if self.current_token.token_type == token_type:
            self.current_token = self.lexer.get_next_token()
        else:
            self.error()
    def factor(self):
        """factor : unary +/- | constant | parenthesized expr | variable"""
        token = self.current_token
        if token.token_type == TokenType.PLUS:
            self.eat(TokenType.PLUS)
            node = UnaryOp(token, self.factor())
            return node
        elif token.token_type == TokenType.MINUS:
            self.eat(TokenType.MINUS)
            node = UnaryOp(token, self.factor())
            return node
        elif token.token_type == TokenType.INT_CONST:
            self.eat(TokenType.INT_CONST)
            return Num(token)
        elif token.token_type == TokenType.FLOAT_CONST:
            self.eat(TokenType.FLOAT_CONST)
            return Num(token)
        elif token.token_type == TokenType.CHAR_CONST:
            self.eat(TokenType.CHAR_CONST)
            return String(token)
        elif token.token_type == TokenType.LPAREN:
            # NOTE(review): TokenType.LPAREN/RPAREN are not defined in the
            # TokenType enum as shown, and the lexer never emits '(' or ')'
            # tokens — confirm.
            self.eat(TokenType.LPAREN)
            node = self.expr()
            self.eat(TokenType.RPAREN)
            return node
        else:
            return self.variable()
    def term(self):
        """term : factor ((TIMES | DIVIDE) factor)*"""
        node = self.factor()
        while self.current_token.token_type in (TokenType.TIMES, TokenType.DIVIDE):
            token = self.current_token
            if token.token_type == TokenType.TIMES:
                self.eat(TokenType.TIMES)
            elif token.token_type == TokenType.DIVIDE:
                self.eat(TokenType.DIVIDE)
            node = BinOp(left=node, op=token, right=self.factor())
        return node
    def expr(self):
        """expr : term ((PLUS | MINUS) term)*"""
        node = self.term()
        while self.current_token.token_type in (TokenType.PLUS, TokenType.MINUS):
            token = self.current_token
            if token.token_type == TokenType.PLUS:
                self.eat(TokenType.PLUS)
            elif token.token_type == TokenType.MINUS:
                self.eat(TokenType.MINUS)
            node = BinOp(left=node, op=token, right=self.term())
        return node
    def variable(self):
        """variable : IDENTIFIER"""
        var_token = self.current_token
        self.eat(TokenType.IDENTIFIER)
        return Var(var_token)
    def var_declaration(self):
        """var_declaration : INT identifier (COMMA identifier)* SEMI

        Returns a two-element list: [type marker VarDecl, list of Var names].
        NOTE(review): only `int` is accepted here although the grammar also
        lists float/char/double — confirm intended.
        """
        self.eat(TokenType.INT)
        var_type = VarDecl(TokenType.INT, None)
        var_names = [Var(self.current_token)]
        self.eat(TokenType.IDENTIFIER)
        while self.current_token.token_type == TokenType.COMMA:
            self.eat(TokenType.COMMA)
            var_names.append(Var(self.current_token))
            self.eat(TokenType.IDENTIFIER)
        self.eat(TokenType.SEMI)
        return [var_type, var_names]
    def statement(self):
        """statement : declaration list (starting at INT) | assignment

        NOTE(review): the two branches return different shapes (a list of
        VarDecl vs a single Assign node), yet parse() always calls
        children.extend() on the result — confirm.
        """
        if self.current_token.token_type == TokenType.INT:
            decl_list = []
            var_decl = self.var_declaration()
            decl_list.append(VarDecl(var_decl[0], var_decl[1]))
            while self.current_token.token_type == TokenType.IDENTIFIER:
                var_decl = self.var_declaration()
                decl_list.append(VarDecl(var_decl[0], var_decl[1]))
            return decl_list
        else:
            # NOTE(review): TokenType.ASSIGN is missing from the enum as
            # shown, and the lexer never tokenizes '=' — confirm.
            left = self.variable()
            token = self.current_token
            self.eat(TokenType.ASSIGN)
            right = self.expr()
            node = Assign(left=left, op=token, right=right)
            return node
    def parse(self):
        """Parse the whole token stream into a Compound root node.

        NOTE(review): only IDENTIFIER-led input reaches statement(); an
        INT-led declaration falls through to expr() and fails. Also relies
        on TokenType.EOF, which is missing from the enum as shown — confirm.
        """
        node = Compound()
        while self.current_token.token_type != TokenType.EOF:
            if self.current_token.token_type == TokenType.SEMI:
                self.eat(TokenType.SEMI)
            elif self.current_token.token_type == TokenType.IDENTIFIER:
                node.children.extend(self.statement())
            else:
                node.children.append(self.expr())
        return node
语义检查器的实现
语义检查器是编译器的另一个重要模块,用于检查程序代码在语义上是否正确。在实现语义检查器时,我们需要进行类型检查、符号引用和作用域判断等操作,以保证程序在执行过程中的正确性。在本文中,我们将利用语法树对程序进行语义检查,并在发现错误时立即抛出异常。其代码如下:
class NodeVisitor:
    """Base class implementing visitor-pattern dispatch over AST nodes."""

    def visit(self, node):
        """Dispatch to visit_<ClassName>, falling back to generic_visit."""
        handler = getattr(self, f"visit_{type(node).__name__}", self.generic_visit)
        return handler(node)

    def generic_visit(self, node):
        """Fallback for node types without a dedicated visit_* handler."""
        print(f"No visit_{type(node).__name__} method implemented")
class Symbol:
    """A named symbol-table entry, optionally carrying a declared type."""
    def __init__(self, name, type=None):
        self.name = name
        self.type = type
class ScopedSymbolTable:
    """A symbol table chained to its enclosing (outer) scope for lookups."""

    def __init__(self, scope_name, scope_level, enclosing_scope=None):
        self._symbols = {}                    # name -> Symbol
        self.scope_name = scope_name
        self.scope_level = scope_level        # 1 = global, deeper = nested
        self.enclosing_scope = enclosing_scope

    def __str__(self):
        """Human-readable dump of the scope headers and its contents."""
        h1 = "SCOPE (SCOPED SYMBOL TABLE)"
        lines = ["\n", h1, "-" * len(h1)]
        for header_name, header_value in (
            ("Scope name", self.scope_name),
            ("Scope level", self.scope_level),
            ("Enclosing scope", self.enclosing_scope.scope_name if self.enclosing_scope else None),
        ):
            lines.append(f"{header_name:<15}: {header_value}")
        # Fixed: this assignment was fused onto the append line above in the
        # original (a syntax error).
        h2 = "Scope (Scoped symbol table) contents"
        lines.extend([h2, "-" * len(h2)])
        lines.extend(f"{key:>7}: {value}" for key, value in self._symbols.items())
        lines.append("\n")
        return "\n".join(lines)

    def insert(self, symbol):
        """Record *symbol* in this scope, keyed by its name."""
        self._symbols[symbol.name] = symbol

    def lookup(self, name, current_scope_only=False):
        """Resolve *name* here, then (unless restricted) in enclosing scopes.

        Returns the Symbol, or None if it is not declared anywhere visible.
        """
        symbol = self._symbols.get(name)
        if symbol is not None:
            return symbol
        if current_scope_only:
            return None
        if self.enclosing_scope is not None:
            return self.enclosing_scope.lookup(name)
        return None
class SemanticAnalyzer(NodeVisitor):
    """Semantic pass: resolves names via scoped symbol tables and checks types.

    Each visited expression node is annotated with a `.type` attribute
    (a TokenType member) used for mismatch detection.
    """

    def __init__(self):
        self.current_scope = None  # innermost ScopedSymbolTable, set in visit_Program

    def visit_Compound(self, node):
        for child in node.children:
            self.visit(child)

    def visit_BinOp(self, node):
        self.visit(node.left)
        self.visit(node.right)
        if node.op.token_type in (TokenType.PLUS, TokenType.MINUS, TokenType.TIMES, TokenType.DIVIDE):
            # Arithmetic requires matching operand types; the result keeps them.
            if node.left.type != node.right.type:
                raise Exception("Type mismatch in binary operator")
            node.type = node.left.type
        elif node.op.token_type == TokenType.ASSIGN:
            # NOTE(review): the parser builds assignments as Assign nodes, so
            # this branch looks unreachable — confirm.
            if not isinstance(node.left, Var):
                raise Exception(f"{node.left} is not assignable")
            if node.left.type != node.right.type:
                raise Exception("Type mismatch in assignment")

    def visit_Num(self, node):
        # Fixed: float literals were always typed INT; classify by token kind.
        if node.token.token_type == TokenType.FLOAT_CONST:
            node.type = TokenType.FLOAT
        else:
            node.type = TokenType.INT

    def visit_String(self, node):
        node.type = TokenType.CHAR

    def visit_VarDecl(self, node):
        # Reject redeclaration within the same scope, then record the symbol.
        var_name = node.var_name.value
        if self.current_scope.lookup(var_name, current_scope_only=True):
            raise Exception(f"{var_name} already declared in current scope")
        self.current_scope.insert(Symbol(var_name, node.var_type))

    def visit_Var(self, node):
        var_name = node.value
        symbol = self.current_scope.lookup(var_name)
        if symbol is None:
            raise Exception(f"{var_name} is not defined")
        # Propagate the declared type onto the reference node.
        node.type = symbol.type

    def visit_Assign(self, node):
        self.visit(node.left)
        self.visit(node.right)

    def visit_Program(self, node):
        # Enter the global scope, analyze the program body, then restore.
        global_scope = ScopedSymbolTable(scope_name='global', scope_level=1, enclosing_scope=self.current_scope)
        self.current_scope = global_scope
        self.visit(node.block)
        self.current_scope = self.current_scope.enclosing_scope
目标代码生成器的实现
目标代码生成器是编译器的最后一步,其目的是将语法树转换为目标机器的汇编代码。在实现目标代码生成器时,我们需要根据目标机器的指令集和内存结构生成相应的汇编指令,以完成对程序的最终翻译。在本文中,我们将以类C语言作为源语言,将其翻译为32位x86汇编代码,并以x86处理器作为目标机器。
由于目标机器的处理能力限制,在实现代码生成器时,我们需要做到尽可能的优化,以达到更好的程序性能。在本文中,我们将实现一些简单的代码优化技术,例如常量折叠和寄存器分配。其代码如下:
class X86CodeGenerator(NodeVisitor):
    """Walks the AST and emits 32-bit x86 assembly (Intel syntax).

    Expressions compile stack-machine style: operands are pushed, binary
    operators pop two values and push the result.
    """

    def __init__(self):
        self.code = []                                # .text instructions
        self.label_count = 0                          # counter for new_label()
        self.data_section = [".data\n"]
        self.rodata_section = [".section .rodata\n"]  # string literals
        self.bss_section = [".bss\n"]
        # NOTE(review): must be set to a populated ScopedSymbolTable by the
        # driver before visiting — nothing in this class assigns it.
        self.current_scope = None

    def new_label(self):
        """Return a fresh unique code label: L0, L1, ..."""
        self.label_count += 1
        return f"L{self.label_count - 1}"

    def emit(self, code):
        self.code.append(code)

    def emit_data(self, data):
        self.data_section.append(data)

    def emit_rodata(self, data):
        self.rodata_section.append(data)

    def emit_bss(self, data):
        self.bss_section.append(data)

    def visit_Compound(self, node):
        for child in node.children:
            self.visit(child)

    def visit_BinOp(self, node):
        # Left is pushed first (deeper on the stack), so the right operand
        # pops into ecx and the left into eax.
        self.visit(node.left)
        self.visit(node.right)
        if node.op.token_type == TokenType.PLUS:
            self.emit("pop ecx")
            self.emit("pop eax")
            self.emit("add eax, ecx")
            self.emit("push eax")
        elif node.op.token_type == TokenType.MINUS:
            self.emit("pop ecx")
            self.emit("pop eax")
            self.emit("sub eax, ecx")
            self.emit("push eax")
        elif node.op.token_type == TokenType.TIMES:
            self.emit("pop ecx")
            self.emit("pop eax")
            self.emit("imul ecx")
            self.emit("push eax")
        elif node.op.token_type == TokenType.DIVIDE:
            self.emit("pop ecx")
            self.emit("pop eax")
            # Fixed: the language's ints are signed, so sign-extend eax into
            # edx:eax and use idiv (was `mov edx, 0` + unsigned `div`, which
            # is wrong for negative dividends).
            self.emit("cdq")
            self.emit("idiv ecx")
            self.emit("push eax")

    def visit_Num(self, node):
        self.emit(f"push {node.value}")

    def visit_String(self, node):
        # Place the literal in .rodata and push its address.
        data_label = f".LC{len(self.rodata_section)}"
        self.emit(f"lea eax, dword ptr [{data_label}]")
        self.emit("push eax")
        self.emit_rodata(f"{data_label}: db \"{node.value}\",0")

    def visit_VarDecl(self, node):
        # Storage allocation is handled elsewhere; nothing to emit here.
        pass

    def visit_Var(self, node):
        symbol = self.current_scope.lookup(node.value)
        # Symbols whose name embeds "ebp-" are treated as stack locals.
        # NOTE(review): the local branch pushes the *address* (lea) while the
        # global branch pushes the *value* — confirm this asymmetry.
        if "ebp-" in symbol.name:
            self.emit(f"lea eax, dword ptr [{symbol.name}]")
            self.emit("push eax")
        else:
            self.emit(f"push {symbol.name}")

    def visit_Assign(self, node):
        self.visit(node.right)
        symbol = self.current_scope.lookup(node.left.value)
        # Fixed: the right-hand value must be popped in BOTH cases; the
        # original only popped for stack locals, leaving the stack unbalanced
        # and storing a stale eax for globals.
        self.emit("pop eax")
        if "ebp-" in symbol.name:
            self.emit(f"mov dword ptr [{symbol.name}], eax")
        else:
            self.emit(f"mov {symbol.name}, eax")

    def visit_Program(self, node):
        self.emit(".intel_syntax noprefix")
        self.emit(".text")
        self.emit(".globl _start")
        self.emit("_start:")
        self.visit(node.block)
        # exit(0) via the Linux int 0x80 syscall interface (eax=1 is sys_exit).
        self.emit("mov eax, 1")
        self.emit("xor ebx, ebx")
        self.emit("int 0x80")
        self.emit("\n")
        self.emit_data("".join(self.data_section))
        self.emit("\n")
        self.emit_rodata("".join(self.rodata_section))
        self.emit("\n")
        self.emit_bss("".join(self.bss_section))
总结
本文介绍了一个简单的编译器的实现过程,包括词法分析器、语法分析器、语义检查器和目标代码生成器。虽然这个编译器实现的语言很简单,但其实现过程可以帮助我们更好地理解编译器的工作原理,从而更好地实现更复杂的编程语言。
在实现编译器的过程中,我们发现编译器是一个功能非常强大的工具,其可以将高级语言翻译为底层机器语言,并帮助我们解决许多代码开发中的问题。同时,编译器的实现也是一个高度技术性的过程,需要对计算机系统、编程语言和编译原理等方面有深入的理解和掌握。
本文中的代码实现虽然比较简单,但其思路和实现方法可以帮助我们更好地理解编译器的工作原理。在实际的编程中,建议使用现有的编译器工具,避免自己重新造轮子。不过,对于想要深入了解编译原理的同学,可以尝试实现更复杂的编程语言编译器,以提升自己的编程技术水平。
本文来自博客园,作者:纯爱掌门人,转载请注明原文链接:https://www.cnblogs.com/abinzhao/p/18748462
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.mzph.cn/news/908176.shtml
如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈email:809451989@qq.com,一经查实,立即删除!