算法描述
对于给出的源代码,我们按行将其读入,对于每一行单独进行词法分析。
- 过滤行前后空格
- 对字符串进行词语的分割 - 有空格则把空格前的字符归为一个词
- 比较上一个字符和当前字符,判断两者之间是否需要进行分割
 
- 检查词语是否合法
- 词语合法则按 [待测代码中的单词符号] [TAB] <[单词符号种别],[单词符号内容]> 进行输出。其中,单词符号种别为 KW(关键字)、OP(运算符)、SE(界符)、IDN(标识符)、INT(整型数);单词符号内容:KW、OP、SE 为其编号(见单词表),其余为其值。
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <sstream>

using namespace std;

// Number of entries in WORD.
const int WORD_NUM = 26;

// Reserved word table. A token's numeric code is its index here plus one.
// The order follows the documented word table: keywords 1-6, operators 7-20
// (with "<" = 13 and ">" = 14), separators 21-26.
const string WORD[WORD_NUM] = {
    // keywords
    "int", "void", "return", "const", "main", "struct",
    // operators
    "+", "-", "*", "/", "%", "=", "<", ">", "==", "<=", ">=", "!=", "&&", "||",
    // separators
    "(", ")", "{", "}", ";", ",",
};

// Characters that may form an operator token. '!' is included so that "!="
// is collected as a single operator word instead of being split apart.
const string OPERATOR = "+-*/%><=&|!";

// Characters that always form a one-character token of their own.
// Note: '[' and ']' are split off here but have no entry in WORD.
const string SEPARATER = "(){};,[]";
int kws = 0, kwe = 6, ops = 6, ope = 20, ses = 20, see = 26;class Analyzer {private:vector<string> lines;vector<string> token;string fileName;ofstream fout;int isWord(string word) {for(int i = 0; i < WORD_NUM; i++) {if(word == WORD[i])return i;}return -1;}bool isKeyWord(int idx) {return kws <= idx && idx < kwe;}bool isOperator(int idx) {return ops <= idx && idx < ope;}bool isOperator(char ch) {return OPERATOR.find(ch) != OPERATOR.npos;}bool isSeparater(int idx) {return ses <= idx && idx < see;}bool isSeparater(char ch) {return SEPARATER.find(ch) != SEPARATER.npos;}inline bool isNumber(char ch) {return ch >= '0' && ch <='9';}bool isInt(string word) {for(int i = 0; i < word.size(); i++) {if(!isNumber(word[i]))return false;}return true;}inline bool isCharacter(char ch) {return ch >= 'a' && ch <= 'z' || ch >='A' && ch <= 'Z';}bool isPartOfIdentifier(char c) {return isCharacter(c) || isNumber(c) || c == '_';}bool isIdentifier(string word) {if(isNumber(word[0])) {return false;}for(int i = 1; i < word.size(); i++) {if(!isPartOfIdentifier(word[i]))return false;}return true;}//输出inline void record(string word, string type, string content) {char TAB = '\t';string msg = word + TAB + "<" + type + "," + content + ">";fout << msg << endl;token.push_back(msg);}//int 转 stringstring to_string(int val) {stringstream ss;ss << val;string result;ss >> result;return result;}//分析一个单词bool anaylyseWord(string word) {if(word.empty()) {return true;}int idx = isWord(word);if(idx > -1) {string type;if(isKeyWord(idx)) type = "KW";if(isOperator(idx)) type = "OP";if (isSeparater(idx)) type = "SE";record(word, type, to_string(idx + 1));return true;} else {if(isIdentifier(word)) {record(word,"IND", word);return true;}if(isInt(word)) {record(word,"INT", word);return true;}}fout << "ERROR detected!" << endl;cout << "ERROR detected!" 
<< endl;return false;}//去除字符串前后空格string trim(string s) { if(s == "") {return "";}int l = 0, r = s.size() - 1;while(s[l] == ' ' && l < s.size()) l++;while(s[r] == ' ' && r > l) r--;return s.substr(l,r + 1);}//判断两个相邻字符是否需要分割bool check(char a, char b) {if ((isOperator(a) && !isOperator(b)) ||(!isOperator(a) && isOperator(b)) || isSeparater(a) ||(!isSeparater(a) && isSeparater(b)))return false;return true;}public:Analyzer(string fileName) {readFile(fileName);}~Analyzer() {fout.close();}vector<string> getToken() {return token;}void readFile(string fileName) {this->fileName = fileName;fstream fin(fileName.c_str());if (!fin.is_open()) {throw "无法打开文件";}string line;while (getline(fin, line)) {line = trim(line);if(!line.empty())lines.push_back(line);}fin.close();// fout.open("token.txt");fout.open(fileName.substr(0,fileName.find_last_of(".")) + ".out");}void analyse() {int l = 0;string word = "";while(l < lines.size()) {string line = lines[l++]; //读入一行word.clear();for(int i = 0; i < line.size(); i++) {if(line[i] == ' ' || line[i] == '\t') { //分割单词if(!anaylyseWord(word)) return; //判断单词是否合法并打印word.clear();continue;}if(!check(word[word.size() - 1], line[i])) { //分割单词if(!anaylyseWord(word)) return; //判断单词是否合法并打印word.clear();}word += line[i]; }anaylyseWord(word); //到行末结束后,将剩余的拼成一个单词}}
};int main() {try {Analyzer analyzer("a.sy");analyzer.analyse();system("pause");} catch (const char *msg) {cout << msg << endl;}return 0;
}
算法NFA和DFA及单词表

 
| 单词符号 | 种类 | 种别码 | 
|---|---|---|
| int | 关键字 | 1 | 
| void | 关键字 | 2 | 
| return | 关键字 | 3 | 
| const | 关键字 | 4 | 
| main | 关键字 | 5 | 
| struct | 关键字 | 6 | 
| + | 运算符 | 7 | 
| - | 运算符 | 8 | 
| * | 运算符 | 9 | 
| / | 运算符 | 10 | 
| % | 运算符 | 11 | 
| = | 运算符 | 12 | 
| < | 运算符 | 13 | 
| > | 运算符 | 14 | 
| == | 运算符 | 15 | 
| <= | 运算符 | 16 | 
| >= | 运算符 | 17 | 
| != | 运算符 | 18 | 
| && | 运算符 | 19 | 
| || | 运算符 | 20 | 
| ( | 界符 | 21 | 
| ) | 界符 | 22 | 
| { | 界符 | 23 | 
| } | 界符 | 24 | 
| ; | 界符 | 25 | 
| , | 界符 | 26 |