# 网页文本预处理 (text preprocessing for web page text)
import re
class Tool:
    """Strip HTML markup from a page fragment and normalize its whitespace.

    Each class attribute is a pre-compiled regular expression used by
    :meth:`replace`. The attribute names and patterns are part of the class
    surface and are kept unchanged.
    """

    # <img ...> tags and runs of exactly seven spaces. The trailing empty
    # alternative only ever matches the empty string, so replacing it with
    # "" has no effect.
    removeImg = re.compile('<img.*?>| {7}|')
    # Opening and closing anchor tags (the link text itself is kept).
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that are turned into a line break.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells are turned into a tab.
    replaceTD = re.compile('<td>')
    # Any tag starting with "<p" is turned into a line break.
    replacePara = re.compile('<p.*?>')
    # Single or double <br> becomes one line break.
    replaceBR = re.compile('<br><br>|<br>')
    # Any remaining <...> tag on a single line ('.' does not match newlines).
    removeExtraTag = re.compile('<.*?>')
    # NOTE(review): this matches a literal `<span" "...</span>` sequence and is
    # applied after removeExtraTag, so it appears to be effectively
    # unreachable. Kept as-is for backward compatibility.
    removeSpan = re.compile('<span(" ")+</span>')
    # One or more newlines, optionally followed by a single tab.
    replaceN = re.compile("(\n)+(\t)?")
    # Pairs of ideographic spaces (U+3000) and no-break spaces (U+00A0).
    # The alternatives are listed twice in the pattern; this is redundant but
    # harmless.
    special_symbol = re.compile("\u3000\u3000|\xa0|\xa0|\u3000\u3000")

    def replace(self, x: str) -> str:
        """Return *x* with HTML tags removed and whitespace normalized.

        The substitutions are applied in a fixed order: special whitespace,
        images, anchors, block-level tags to newlines, table cells to tabs,
        paragraph and <br> tags to newlines, every remaining tag, and finally
        newline runs collapsed to a single newline. Leading and trailing
        whitespace is stripped from the result.

        Args:
            x: The raw text (typically an HTML fragment) to clean.

        Returns:
            The cleaned text.
        """
        steps = (
            # The result of this first substitution used to be discarded, so
            # U+00A0 / U+3000 pairs inside the text were never removed.
            (self.special_symbol, ""),
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
            (self.removeSpan, ""),
            (self.replaceN, "\n"),
        )
        for pattern, replacement in steps:
            # re.sub accepts both compiled patterns and plain strings, so an
            # attribute overridden with a pattern string keeps working.
            x = re.sub(pattern, replacement, x)
        return x.strip()
# Build a Tool instance and run its replace() cleanup over `s`.
# NOTE(review): `s` is not defined in the visible source — presumably the raw
# HTML/page text to clean; confirm it is assigned before this point.
tool = Tool()
sub_content = tool.replace(s)