A Python web crawler for 中国国际招标有限公司 (China International Tendering Co.)
Scrapes tender announcements from https://www.cntcitc.com.cn and stores them in Redis: each announcement becomes a hash keyed by its URL, a per-keyword "last seen link" marker lets repeat runs stop early, and a daily counter tracks how many items were processed.
import re
from datetime import datetime

import redis
import requests
from lxml import etree
from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD
from items.sql import MySQL


class Cntcitc:
    def __init__(self):
        # decode_responses=True returns str values; the deprecated charset
        # kwarg is redundant with encoding and dropped here
        self.redis = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB,
                                 password=REDIS_PASSWORD,
                                 decode_responses=True, encoding='UTF-8')
        self.db = MySQL()   # MySQL handle for a later sync; only connected here
        self.db.connect()
        self.name = '中国国际招标有限公司'  # China International Tendering Co., Ltd.
        self.url = 'https://www.cntcitc.com.cn/searchPage.html'
        self.api_url = 'https://www.cntcitc.com.cn/search.html'
        self.today = datetime.today().strftime('%Y-%m-%d')
        self.counter_key = f"cntcitc:counter:{self.today}"
        self.overall_cycle = False  # flips to True once a previously seen link shows up
        self.headers = {
            "referer": self.url,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        }

    def get_data(self, key, page=1):
        """POST one page of search results for a keyword; return the parsed
        HTML tree, or None when the site reports no matches."""
        payload = {'channelId': '-1', 'key': key, 'startTime': '2024-06-18',
                   'endTime': '', 'currentPage': page}
        con = requests.post(url=self.api_url, headers=self.headers,
                            data=payload).content.decode('utf8')
        html = etree.HTML(con)
        content = ''.join(html.xpath('/html/body/div/div/form/div[2]/ul/text()')).strip()
        print(f"key: {key}, result banner: {content}")
        if content == "未查询到相关内容":  # the site's "no matching results" banner
            return None
        return html

    def get_page(self, key):
        """Read the total page count ("共N页") from the pager, or None."""
        html = self.get_data(key)
        if html is None:
            return None
        page_text = ''.join(html.xpath('/html/body/div/div/form/div[2]/div/span[2]/text()'))
        match = re.search(r"共\d+页", page_text)
        if not match:
            return None
        cleaned_text = re.sub(r"\s", "", match.group())
        return re.search(r"\d+", cleaned_text).group()

    def spider(self, key):
        """Walk every result page for a keyword, stopping at the link
        recorded by the previous run."""
        pages = self.get_page(key)
        if pages is None:
            return
        self.overall_cycle = False
        last_page_key = f"cntcitc:last_link:{key}"
        last_page_link = str(self.redis.get(last_page_key) or "")
        try:
            for page in range(1, int(pages) + 1):
                if self.overall_cycle:
                    break
                html = self.get_data(key, page)
                if html is None:
                    continue
                for i in range(1, 16):  # at most 15 result rows per page
                    title = ''.join(html.xpath(
                        f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/text()')).strip()
                    if title == "":
                        break
                    suffix_link = ''.join(html.xpath(
                        f'/html/body/div/div/form/div[2]/ul/li[{i}]/a/@href'))
                    link = f"https://www.cntcitc.com.cn/{suffix_link}"
                    if last_page_link == link:
                        # reached the newest item of the previous run; stop entirely
                        self.overall_cycle = True
                        break
                    publish_time_text = ''.join(html.xpath(
                        f'/html/body/div/div/form/div[2]/ul/li[{i}]/text()'))
                    match = re.search(r'\d{4}-\d{2}-\d{2}', publish_time_text)
                    publish_time = match.group() if match else ""
                    self.store_to_redis(link, title, publish_time, key)
                    if last_page_link == "":
                        # first run for this keyword: remember the newest link
                        self.redis.set(last_page_key, link)
                        last_page_link = link
        except Exception as e:
            print(f"中国国际招标有限公司 crawler raised an exception: {e}")
            self.redis.set(last_page_key, "")  # force a full re-crawl next run

    def store_to_redis(self, link, title, show_times, key):
        """Store one announcement as a hash keyed by its URL; merge the
        keyword list when the link already exists."""
        if self.redis.exists(link):
            existing_keys = self.redis.hget(link, 'keys').split(',')
            if key not in existing_keys:
                existing_keys.append(key)
                self.redis.hset(link, 'keys', ','.join(existing_keys))
                self.redis.hset(link, 'is_synced', 'false')  # needs re-syncing
        else:
            self.redis.hset(link, mapping={'title': title,
                                           'show_times': show_times,
                                           'keys': key,
                                           'is_synced': 'false'})
            self.redis.expire(link, 2419200)  # keep for 28 days
        self.redis.incr(self.counter_key)

    def get_today_crawl_count(self):
        return int(self.redis.get(self.counter_key) or 0)

    def process(self):
        key_list = ['动漫', '引流', '银行', '业务']  # anime, lead generation, bank, business
        for key in key_list:
            self.spider(key)
        print(f'Number of items scraped for 中国国际招标有限公司 today: {self.get_today_crawl_count()}')


if __name__ == '__main__':
    bank_cntcitc = Cntcitc()
    bank_cntcitc.process()
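
The listing imports two local modules that are not shown: config (Redis connection constants) and items.sql (a small MySQL wrapper). A minimal sketch of both follows; the connection values are placeholders, and the MySQL class is reduced to the single connect() method the crawler actually calls, so the real modules in this project may look different.

# config.py -- placeholder connection constants; substitute your own
REDIS_IP = '127.0.0.1'
REDIS_PORT = 6379
REDIS_DB = 0
REDIS_PASSWORD = None

# items/sql.py -- minimal stand-in for the project's MySQL wrapper.
# Only connect() is used by the crawler; the real class presumably also
# has insert/sync helpers for the records flagged is_synced = 'false'.
import pymysql


class MySQL:
    def __init__(self):
        self.conn = None

    def connect(self):
        # hypothetical credentials and database name
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password='', database='spider',
                                    charset='utf8mb4')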
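
To sanity-check a run, the stored hashes and the daily counter can be read back with the same connection settings. A quick inspection snippet, assuming the config constants above; the scan pattern matches the announcement URLs used as hash keys:

import redis
from datetime import datetime

from config import REDIS_IP, REDIS_PORT, REDIS_DB, REDIS_PASSWORD

r = redis.Redis(host=REDIS_IP, port=REDIS_PORT, db=REDIS_DB,
                password=REDIS_PASSWORD, decode_responses=True)

today = datetime.today().strftime('%Y-%m-%d')
print('items scraped today:', r.get(f'cntcitc:counter:{today}') or 0)

# each announcement is a hash keyed by its full URL
for link in r.scan_iter('https://www.cntcitc.com.cn/*', count=100):
    print(link, r.hgetall(link))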