1. Overview
Here are the crawler examples I wrote earlier, collected for easy reference. Each example has been consolidated into its own function; the code itself is not explained in detail. Covered: regex scraping (top250, MovieDownload), a POST data endpoint (VegetableValue), BeautifulSoup parsing (CatchPicture), lxml XPath (Xpath), a Referer anti-leech bypass (Vidio), and a paginated novel downloader (aiodownload / getCatalog).
2. Code
import time
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import csv
from bs4 import BeautifulSoup
import os
from lxml import etree
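
# Hedged aside (not from the original post): the functions below open CSV files with a
# bare open(...). On Windows that inserts a blank line between rows and shows garbled
# Chinese in Excel. A minimal sketch of a safer opener, assuming UTF-8 output is
# acceptable; none of the functions below are rewritten to use it:
def open_csv(path):
    # newline="" stops the csv module from doubling line endings;
    # utf-8-sig adds a BOM so Excel auto-detects the encoding
    return open(path, mode="a", newline="", encoding="utf-8-sig")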

def top250():
    # re example 1
    for a in range(0, 250, 25):
        url = "https://movie.douban.com/top250?start={0}".format(a)
        # print(url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        }
        resp = requests.get(url, headers=headers)  # GET with a UA header to get past basic anti-crawler checks
        page_content = resp.text
        obj = re.compile(
            r'<li>.*?<em class="">(?P<ranking>.*?)</em>'
            r'.*?<span class="title">(?P<name>.*?)</span>.*?'
            r'.*?<br>(?P<year>.*?) .*?'
            r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
            r'.*?<span>(?P<number>.*?)</span>',
            re.S)
        result = obj.finditer(page_content)
        f = open("DoubanTop250.csv", mode="a")
        csvwriter = csv.writer(f)
        for i in result:
            # print("ranking: {0}; title: {1}; year: {2}; score: {3}".format(
            #     i.group("ranking"),
            #     i.group("name"),
            #     i.group("year").strip(),
            #     i.group("score")))
            dic = i.groupdict()
            dic['year'] = dic['year'].strip()
            csvwriter.writerow(dic.values())
        f.close()
        time.sleep(2)  # delay between pages to dodge anti-crawling, otherwise the IP gets banned
        print("Collected {0} records".format(a + 25))
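
# Hedged aside (not from the original post): MovieDownload() below calls requests.get
# with verify=False, which makes urllib3 emit an InsecureRequestWarning on every
# request. A minimal way to silence just that warning:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)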

def MovieDownload():
    # re example 2
    domain = "https://dytt89.com/"
    resp = requests.get(domain, verify=False)  # the special verify=False option skips certificate checks
    resp.encoding = 'gbk'  # the site uses the GBK national-standard encoding
    # print(resp.text)
    f = open("Dytt2022新片精品电影下载地址.csv", mode="a")
    csvwriter = csv.writer(f)
    obj1 = re.compile(r'2022新片精品.*?'
                      r'<ul>(?P<ul>.*?)</ul>', re.S)
    obj2 = re.compile(r'''<li><a href='(?P<href>.*?)' title="''', re.S)
    obj3 = re.compile(r'<div class="title_all"><h1>(?P<movie>.*?)</h1></div>.*?'
                      r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
                      r'<a href="(?P<download>.*?)">', re.S)
    child_href_list = []
    result1 = obj1.finditer(resp.text)
    for i in result1:
        # first layer: grab the relevant block of the home page
        ul = i.group('ul')
        # print(ul)
        result2 = obj2.finditer(ul)
        for j in result2:
            # second layer: build the URL of each detail page
            child_href = domain + j.group('href').strip("/")  # join the domain with the page-specific part
            child_href_list.append(child_href)
    k = 0
    for href in child_href_list:
        child_resp = requests.get(href, verify=False)
        child_resp.encoding = 'gbk'
        result3 = obj3.search(child_resp.text)  # extract the download link we need
        # print(result3.group('movie'))
        # print(result3.group('download'))
        dic = result3.groupdict()
        csvwriter.writerow(dic.values())
        k = k + 1  # counter
        print("Collected {0} movies so far".format(k))
    f.close()


def VegetableValue():
    # POST example
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    # the page source contains no data, so use the F12 network panel to find the data endpoint
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
    }
    f = open("菜价.csv", mode="a")
    csvwriter = csv.writer(f)
    data = {'limit': 3}  # how many records to fetch
    # difference between GET and POST here:
    # GET can only take whatever the current page exposes;
    # POST lets you control how much data comes back via the data payload
    # resp = requests.get(url, headers=head).json()
    resp = requests.post(url, headers=head, data=data).json()
    lis = resp.get('list')
    for i in lis:
        name = i.get("prodName")
        low_price = i.get("lowPrice")
        high_price = i.get("highPrice")
        average_price = i.get("avgPrice")
        producing_area = i.get("place")
        unit = i.get("unitInfo")
        date = i.get("pubDate")
        csvwriter.writerow([name, low_price, high_price, average_price, producing_area, unit, date])
    f.close()


def CatchPicture(url):
    # bs4 example
    url_download = "https://pic.netbian.com/"
    resp = requests.get(url)
    resp.encoding = "gbk"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    alist = main_page.find("div", class_="slist").find_all("a")
    # print(alist)
    for a in alist:
        # print(a.get('href'))
        href = url_download + a.get('href').strip("/")  # join the domain with the page-specific part
        # print(href)
        child_page_resp = requests.get(href)
        child_page_resp.encoding = 'gbk'
        child_page_text = child_page_resp.text
        child_page = BeautifulSoup(child_page_text, "html.parser")
        img = child_page.find("div", class_="photo").find("img")
        img_name = child_page.find("div", class_="photo").find("img").get("title")
        # print(img.get("src"))
        src = url_download + img.get("src").strip("/")
        # print(src)
        # print(img_name)
        img_resp = requests.get(src)
        img = img_resp.content  # raw bytes
        with open("img2/" + img_name + ".jpeg", mode="wb") as f:
            f.write(img)
        print(img_name + " downloaded!")
        # break
        time.sleep(0.5)  # necessary delay against anti-crawling


def Xpath():
    # for some reason the live site always returns an empty list and the data cannot be found,
    # yet a hand-made local HTML file reaches the target nodes without trouble
    tree = etree.parse('file:///C:/Users/86183/Desktop/1.html')
    r1 = tree.xpath('/html/body/div[2]/p')  # walk down the node tree from the root
    # /html/body/div[2]/p[1]
    for div in r1:
        # /html/body/div[2]/p[1]
        a = div.xpath('./text()')
        print(a)
    # basic format for testing XPath in the browser console: $x("xpath expression");
    # a correct expression returns values, an incorrect one returns nothing
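
# Hedged aside (not from the original post): the local 1.html used by Xpath() is not
# shown anywhere in the post. A self-contained check of the same expression against an
# in-memory document, so the node path can be verified without a file on the Desktop
# (the sample markup is hypothetical):
def xpath_selfcheck():
    sample = "<html><body><div>skip</div><div><p>first</p><p>second</p></div></body></html>"
    tree = etree.HTML(sample)  # lxml builds an element tree straight from the string
    for p in tree.xpath('/html/body/div[2]/p'):  # div[2] is the second <div> under <body>
        print(p.xpath('./text()'))  # prints ['first'], then ['second']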

def Vidio():
    # bypassing Pearvideo's referer anti-leech check
    url = "https://www.pearvideo.com/video_1733893"  # page of the video to pull
    contId = url.split("_")[1]  # yields 1733893
    resp = requests.get(url)
    resp.encoding = "utf-8"
    # print(resp.text)
    main_page = BeautifulSoup(resp.text, "html.parser")
    title = main_page.find("div", class_="box-left clear-mar").find("h1").text
    # print(title)
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25",
        # referer anti-leech traces where you came from: the access order must be
        # 1 -> 2 -> 3, so we add a Referer to simulate it; going straight 1 -> 3 fails
        "Referer": url
    }
    vidio_status = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.5623242368176109"
    resp = requests.get(vidio_status, headers=header)
    # print(resp.text)
    dic = resp.json()
    # print(dic)
    srcUrl = dic["videoInfo"]["videos"]['srcUrl']
    systemTime = dic['systemTime']
    # systemTime: 1660186591481
    # fake: https://video.pearvideo.com/mp4/adshort/20210701/1660186531481-15708981_adpkg-ad_hd.mp4
    # real: https://video.pearvideo.com/mp4/adshort/20210701/cont-1733893-15708981_adpkg-ad_hd.mp4
    srcUrl_true = srcUrl.replace(systemTime, f"cont-{contId}")  # compare the two URLs and swap out the useless part of the old link
    # print(srcUrl_true)
    with open("videos/" + title + ".mp4", mode="wb") as f:
        f.write(requests.get(srcUrl_true).content)
    print(title + " downloaded!")


def aiodownload(cid, title, book):
    url = f"https://www.23qb.com/book/{cid}.html"
    page = 1
    with open(f"novels/{book}.txt", mode="a+") as f:
        f.write("\n")
        f.write("\n" + title + "\n")
        f.write("\n")
        while True:
            resp = requests.get(url).text
            page_thing = BeautifulSoup(resp.replace('\ufffd', ''), "html.parser")
            lists = page_thing.find_all("div", class_="read-content")
            for texts in lists:
                text = texts.find_all('p')
                del text[-1]
            if text[-1].string == "(继续下一页)":
                # chapter continues on a follow-up page: write this page, then fetch the next
                del text[-1]
                page = page + 1
                url = f"https://www.23qb.com/book/{cid}_{page}.html"
                for line in text:
                    txt = line.string
                    try:
                        f.write(txt + "\n")
                    except Exception:
                        f.write("!!!!!!!!" + "\n")
                        continue
                continue
            else:
                for line in text:
                    txt = line.string
                    try:
                        f.write(txt + "\n")
                    except Exception:
                        f.write("!!!!!!!!" + "\n")
                        continue
                break
    print(title + " downloaded")


def getCatalog(url):
    resp = requests.get(url)
    # print(resp.text)
    obj1 = re.compile(r'<meta property="og:novel:book_name" content="(?P<book>.*?)"/>.*?'
                      r'<ul class="chaw_c" id="chapterList">(?P<url>.*?)</ul>', re.S)
    obj2 = re.compile(r'<li><a href="/book/(?P<c_id>.*?).html">(?P<name>.*?).</a></li>', re.S)
    main_page = resp.text
    result = obj1.finditer(main_page)
    for i in result:
        ul = i.group('url')
        book = i.group("book")
        # print(ul)
        result2 = obj2.finditer(ul)
        for ii in result2:
            cid = ii.group("c_id")
            title = ii.group("name")
            aiodownload(cid, title, book)
    print(book + " finished downloading!")


if __name__ == '__main__':
    # top250()
    # MovieDownload()
    # VegetableValue()

    # CatchPicture driver
    '''
    start_time = time.time()
    with ThreadPoolExecutor(10) as t:  # only 10 threads opened here; you can use more
        for i in range(2, 119):
            t.submit(CatchPicture, f"https://pic.netbian.com/4kdongman/index_{i}.html")
            time.sleep(1)  # keep a delay even with threads to avoid an IP ban (this site has already banned four of my IPs)
            print(f"Page {i} downloaded")  # if the console only ever prints this line, the IP has probably been banned
    print("All downloads finished")
    end_time = time.time()
    print('Total time:', round(end_time - start_time, 2), 'seconds')
    '''

    # Xpath()
    # Vidio()

    # novel download
    start_time = time.time()
    b_id = "116418"  # input("Enter the id of the book to download:")  # other ids: "60218", "27309", "4286", "719", "189697"
    url = f"https://www.23qb.com/book/{b_id}/"
    getCatalog(url)
    end_time = time.time()
    print("Download time:", round(end_time - start_time, 2), 'seconds')
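
A closing note: these crawlers already cost me several banned IPs, which is why the functions above sprinkle time.sleep between requests. If you adapt the code, a small retry wrapper with a timeout and a growing pause is cheap insurance. A minimal sketch; the name and parameters are my own, not part of any site's API:

import time
import requests

def polite_get(url, retries=3, pause=2.0, **kwargs):
    # try a few times, backing off a little longer after each failure,
    # and always set a timeout so a dead server cannot hang the crawler
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=10, **kwargs)
        except requests.RequestException:
            time.sleep(pause * (attempt + 1))
    raise RuntimeError(f"giving up on {url} after {retries} attempts")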