A Python web crawler for 中国国际招标有限公司 (China International Tendering Co.)
Scrapes tender announcements from https://www.cntcitc.com.cn and stores them in Redis: each announcement becomes a hash keyed by its URL, a per-keyword "last seen link" marker lets repeat runs stop early, and a daily counter tracks how many items were processed.
import re
from datetime import datetime

import redis
import requests
from lxml import etree
from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD
from items.sql import MySQL


class Cntcitc:
    def __init__(self):
        # decode_responses=True returns str values; the deprecated charset
        # kwarg is redundant with encoding and dropped here
        self.redis = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB,
                                 password=REDIS_PASSWORD,
                                 decode_responses=True, encoding='UTF-8')
        self.db = MySQL()   # MySQL handle for a later sync; only connected here
        self.db.connect()
        self.name = '中国国际招标有限公司'  # China International Tendering Co., Ltd.
        self.url = 'https://www.cntcitc.com.cn/searchPage.html'
        self.api_url = 'https://www.cntcitc.com.cn/search.html'
        self.today = datetime.today().strftime('%Y-%m-%d')
        self.counter_key = f"cntcitc:counter:{self.today}"
        self.overall_cycle = False  # flips to True once a previously seen link shows up
        self.headers = {
            "referer": self.url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        }

    def get_data(self, key, page=1):
        """POST one page of search results for a keyword; return the parsed
        HTML tree, or None when the site reports no matches."""
        payload = {'channelId': '-1', 'key': key, 'startTime': '2024-06-18',
                   'endTime': '', 'currentPage': page}
        con = requests.post(url=self.api_url, headers=self.headers,
                            data=payload).content.decode('utf8')
        html = etree.HTML(con)
        content = ''.join(html.xpath('/html/body/div/div/form/div[2]/ul/text()')).strip()
        print(f"key: {key}, result banner: {content}")
        if content == "未查询到相关内容":  # the site's "no matching results" banner
            return None
        return html

    def get_page(self, key):
        """Read the total page count ("共N页") from the pager, or None."""
        html = self.get_data(key)
        if html is None:
            return None
        page_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/div/span[2]/text()'))
        match = re.search(r"共\d+页", page_text)
        if not match:
            return None
        cleaned_text = re.sub(r"\s", "", match.group())
        return re.search(r"\d+", cleaned_text).group()

    def spider(self, key):
        """Walk every result page for a keyword, stopping at the link
        recorded by the previous run."""
        pages = self.get_page(key)
        if pages is None:
            return
        self.overall_cycle = False
        last_page_key = f"cntcitc:last_link:{key}"
        last_page_link = str(self.redis.get(last_page_key) or "")
        try:
            for page in range(1, int(pages) + 1):
                if self.overall_cycle:
                    break
                html = self.get_data(key, page)
                if html is None:
                    continue
                for i in range(1, 16):  # at most 15 result rows per page
                    title = ''.join(html.xpath(
                        f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/text()')).strip()
                    if title == "":
                        break
                    suffix_link = ''.join(html.xpath(
                        f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/@href'))
                    link = f"https://www.cntcitc.com.cn/{suffix_link}"
                    if last_page_link == link:
                        # reached the newest item of the previous run; stop entirely
                        self.overall_cycle = True
                        break
                    publish_time_text = ''.join(html.xpath(
                        f'/html/body/div/div/form/div[2]/ul/li[{i}]/text()'))
                    match = re.search(r'\d{4}-\d{2}-\d{2}', publish_time_text)
                    publish_time = match.group() if match else ""
                    self.store_to_redis(link, title, publish_time, key)
                    if last_page_link == "":
                        # first run for this keyword: remember the newest link
                        self.redis.set(last_page_key, link)
                        last_page_link = link
        except Exception as e:
            print(f"中国国际招标有限公司 crawler raised an exception: {e}")
            self.redis.set(last_page_key, "")  # force a full re-crawl next run

    def store_to_redis(self, link, title, show_times, key):
        """Store one announcement as a hash keyed by its URL; merge the
        keyword list when the link already exists."""
        if self.redis.exists(link):
            existing_keys = self.redis.hget(link, 'keys').split(',')
            if key not in existing_keys:
                existing_keys.append(key)
                self.redis.hset(link, 'keys', ','.join(existing_keys))
                self.redis.hset(link, 'is_synced', 'false')  # needs re-syncing
        else:
            self.redis.hset(link, mapping={'title': title,
                                           'show_times': show_times,
                                           'keys': key,
                                           'is_synced': 'false'})
            self.redis.expire(link, 2419200)  # keep for 28 days
        self.redis.incr(self.counter_key)

    def get_today_crawl_count(self):
        return int(self.redis.get(self.counter_key) or 0)

    def process(self):
        key_list = ['动漫', '引流', '银行', '业务']  # anime, lead generation, bank, business
        for key in key_list:
            self.spider(key)
        print(f'Number of items scraped for 中国国际招标有限公司 today: {self.get_today_crawl_count()}')


if __name__ == '__main__':
    bank_cntcitc = Cntcitc()
    bank_cntcitc.process()
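
The listing imports two local modules that are not shown: config (Redis connection constants) and items.sql (a small MySQL wrapper). A minimal sketch of both follows; the connection values are placeholders, and the MySQL class is reduced to the single connect() method the crawler actually calls, so the real modules in this project may look different.

# config.py -- placeholder connection constants; substitute your own
REDIS_IP = '127.0.0.1'
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PASSWORD = None

# items/sql.py -- minimal stand-in for the project's MySQL wrapper.
# Only connect() is used by the crawler; the real class presumably also
# has insert/sync helpers for the records flagged is_synced = 'false'.
import pymysql


class MySQL:
    def __init__(self):
        self.conn = None

    def connect(self):
        # hypothetical credentials and database name
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password='', database='spider',
                                    charset='utf8mb4')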
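
To sanity-check a run, the stored hashes and the daily counter can be read back with the same connection settings. A quick inspection snippet, assuming the config constants above; the scan pattern matches the announcement URLs used as hash keys:

import redis
from datetime import datetime

from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD

r = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB,
                password=REDIS_PASSWORD, decode_responses=True)

today = datetime.today().strftime('%Y-%m-%d')
print('items scraped today:', r.get(f'cntcitc:counter:{today}') or 0)

# each announcement is a hash keyed by its full URL
for link in r.scan_iter('https://www.cntcitc.com.cn/*', count=100):
    print(link, r.hgetall(link))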