关键是先分析网页中图片的 HTML 源代码,再据此决定正则表达式如何写。
完整代码
#使用正则表达式爬取多张图片,亮点在于数据解析
#爬取网站:https://www.bilibili.com/read/cv11323037?from=search
import requests
import re
import os
# Scrape every article image from one bilibili page: fetch the page, pull the
# image addresses out of the HTML with a regular expression, then download
# each image into a local directory.

# Directory that receives the downloaded images.
image_path = 'image'
# exist_ok avoids the separate "check, then create" step.
os.makedirs(image_path, exist_ok=True)

# Article page whose images are scraped.
url = 'https://www.bilibili.com/read/cv11323037?from=search'
# Browser-like User-Agent header sent with every request.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36 Edg/91.0.864.48'
}

# requests waits indefinitely unless a timeout is given; bound every call so
# a stalled connection cannot hang the script.
request_timeout = 10  # seconds

page_response = requests.get(url=url, headers=header, timeout=request_timeout)
# Fail loudly on an HTTP error instead of parsing an error page.
page_response.raise_for_status()
page_html = page_response.text

# Save the raw page source to 1.html.
with open('1.html', 'w', encoding='utf-8') as f:
    f.write(page_html)

# Parse the page with a regular expression.
# Sample <img> tag from the page source that the pattern is written against:
#<img data-src="//i0.hdslb.com/bfs/article/45cb84438212c280a5cc22dc6243d4d662a2a535.jpg" width="992" height="700" data-size="226676" class="normal-img" data-index="0" data-type="preview" style="width: 628px; height: 444px;" src="//i0.hdslb.com/bfs/article/45cb84438212c280a5cc22dc6243d4d662a2a535.jpg@785w_555h_progressive.webp">
# The non-greedy group captures the data-src value up to the closing quote
# that precedes the width attribute.
pattern = r'<img data-src="(.*?)" width='
image_srcs = re.findall(pattern, page_html, re.S)
print(image_srcs)
# Parsing finished; download each captured address.

for src in image_srcs:
    # The captured address is protocol-relative ("//host/path" in the sample
    # above), so the scheme has to be prepended.
    image_url = 'https:' + src
    # Use the last path segment as the local file name.
    image_name = src.split('/')[-1]
    try:
        image_response = requests.get(
            url=image_url, headers=header, timeout=request_timeout
        )
        image_response.raise_for_status()
    except requests.RequestException as exc:
        # One failed image should not abort the remaining downloads.
        print("{}爬取失败: {}".format(image_name, exc))
        continue
    with open(os.path.join(image_path, image_name), 'wb') as f:
        f.write(image_response.content)
    print("{}已爬取完毕".format(image_name))