Problem 1

Core code and results:

```python
import requests
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
resp = requests.get(url)
resp.encoding = resp.apparent_encoding
html = resp.text

soup = BeautifulSoup(html, 'html.parser')

# Locate the table body
tbody = soup.find('tbody')
# Get all table rows
tr_list = tbody.find_all('tr')

# Collect the extracted records
data = []
for tr in tr_list:
    # Get all cells in this row
    td_list = tr.find_all('td')
    # Skip rows that do not have enough cells
    if len(td_list) < 5:
        continue
    # Extract the rank
    rank_div = td_list[0].find('div')
    rank = rank_div.get_text().strip() if rank_div else ""
    # Extract the university name
    name_span = td_list[1].find('span', class_='name-cn')
    name = name_span.get_text().strip() if name_span else ""
    # Extract the province/city
    province = td_list[2].get_text().strip()
    # Extract the school type
    school_type = td_list[3].get_text().strip()
    # Extract the total score
    total_score = td_list[4].get_text().strip()
    # Only keep the record if every field is non-empty
    if all([rank, name, province, school_type, total_score]):
        data.append({
            'rank': rank,
            'name': name,
            'province': province,
            'type': school_type,
            'score': total_score,
        })

# Print the header and the data
print(f"{'排名':<4} {'学校名称':<15} {'省市':<4} {'学校类型':<6} {'总分':<4}")
print("-" * 45)
for row in data:
    print(f"{row['rank']:<4} {row['name']:<15} {row['province']:<4} {row['type']:<6} {row['score']:<4}")
```
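A quick aside on `resp.encoding = resp.apparent_encoding`: requests takes `encoding` from the HTTP `Content-Type` header, while `apparent_encoding` is sniffed from the response bytes, so assigning the latter guards against a wrong or missing charset header. A minimal check:

```python
import requests

resp = requests.get("http://www.shanghairanking.cn/rankings/bcur/2020")
# encoding comes from the HTTP header; apparent_encoding is sniffed from the body
print(resp.encoding, resp.apparent_encoding)
resp.encoding = resp.apparent_encoding  # prefer the sniffed charset to avoid mojibake
```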
![1bfce4214d157785d7381e2d2abc7e61](https://img2024.cnblogs.com/blog/3714501/202510/3714501-20251026115821795-2101174164.png) ![0cdfea33-3f09-4778-8aa3-262dcfffe805](https://img2024.cnblogs.com/blog/3714501/202510/3714501-20251026132644825-58360209.png)

Reflections:
Inspecting the target data in the browser quickly pinpoints the main table. The key is finding the patterns in how the page's HTML is written; once those patterns are clear, the way to extract each ranking field follows almost immediately (see the probe sketched below).
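For instance, before writing the full parser, a short probe like this one (a minimal sketch assuming the table is served as static HTML, same URL as above) prints the cell layout of the first data row, which is how the `td` indices used above can be worked out:

```python
import requests
from bs4 import BeautifulSoup

url = "http://www.shanghairanking.cn/rankings/bcur/2020"
resp = requests.get(url)
resp.encoding = resp.apparent_encoding
soup = BeautifulSoup(resp.text, 'html.parser')

# Dump the first data row: one line per cell, index plus collapsed text
first_row = soup.find('tbody').find('tr')
for i, td in enumerate(first_row.find_all('td')):
    print(i, ' '.join(td.get_text().split()))
```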

Problem 2

Core code and results:

```python
import re
import requests
import html

url = "https://search.dangdang.com/?key=%CA%E9%B0%FC&category_id=10009684#J_tab"
resp = requests.get(url)
resp.encoding = "gbk"
html_text = resp.text

# Collect the extracted records
data = []

# Match the <ul> that holds the product list
ul = r'<ul[^>]*id="component_59"[^>]*>(.*?)</ul>'
ul_match = re.search(ul, html_text, re.S)

if ul_match:
    ul_note = ul_match.group(1)  # content inside the <ul> tag
    # Match every <li> inside the <ul> (one per product)
    li_ = r'<li[^>]*id="\d+"[^>]*>(.*?)</li>'
    li_list = re.findall(li_, ul_note, re.S)
    # Walk through each product entry
    for li_each in li_list:
        # Extract the product name
        name_match = re.search(r'<a[^>]*title="([^"]*)"[^>]*name="itemlist-title"', li_each, re.S)
        name = name_match.group(1).strip() if name_match else ""
        # Extract the price
        price_match = re.search(r'<span class="price_n">\s*(.*?)\s*</span>', li_each, re.S)
        price = price_match.group(1).strip() if price_match else ""
        # Convert &yen; to ¥
        price = html.unescape(price)
        if all([price, name]):
            data.append({'price': price, 'name': name})

print(f"\n最终提取到 {len(data)} 个商品")
for row in data:
    print(f"{row['price']:<10} {row['name']:<30}")
```
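A side note on that URL: the `key` parameter is GBK percent-encoded, which is also why `resp.encoding` is set to `"gbk"`. Assuming the search keyword was 书包, the round trip can be checked like this:

```python
from urllib.parse import quote, unquote

print(unquote('%CA%E9%B0%FC', encoding='gbk'))  # -> 书包
print(quote('书包', encoding='gbk'))             # -> %CA%E9%B0%FC
```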
![5e31544549fc4c2e362db94140f77e66](https://img2024.cnblogs.com/blog/3714501/202510/3714501-20251026145208608-941414735.png) ![d20ea28a75cd039081baf2220f3c4596](https://img2024.cnblogs.com/blog/3714501/202510/3714501-20251026145212924-606181751.png)

**Reflections:** Locating the data works the same way as in Problem 1, but the regular expressions have to be written fairly precisely; a sloppy pattern easily matches some stray corner of the page and the scrape fails. The snippet below illustrates the point.
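As an illustration of that pitfall (the HTML fragment here is made up for demonstration, mimicking Dangdang's price markup): a greedy `.*` overshoots to the last closing tag, while the lazy `.*?` used in the code above stops at the first one.

```python
import re

# A made-up fragment with two price spans
snippet = ('<span class="price_n">&yen;59.00</span> ... '
           '<span class="price_n">&yen;99.00</span>')

greedy = re.search(r'<span class="price_n">(.*)</span>', snippet)
lazy = re.search(r'<span class="price_n">(.*?)</span>', snippet)

print(greedy.group(1))  # overshoots: swallows everything up to the last </span>
print(lazy.group(1))    # &yen;59.00 - stops at the first </span>
```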

Problem 3

Core code and results:

```python
import urllib.request
import re
import os
import time
from urllib.parse import urljoin, urlparse


def get_page_content(url):
    """Fetch a page and return its HTML, or None on failure."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        req = urllib.request.Request(url, headers=headers)
        resp = urllib.request.urlopen(req, timeout=10)
        html_content = resp.read().decode('utf-8')
        return html_content
    except Exception as e:
        print(f"获取页面失败: {e}")
        return None


def get_image_urls(html_content, page_url):
    """Collect image URLs from <img> tags and CSS backgrounds, made absolute."""
    # Match the src attribute of <img> tags
    img_pattern = r'<img[^>]+src="([^">]+)"'
    img_urls = re.findall(img_pattern, html_content, re.I)
    # Match CSS background images
    css_pattern = r'background-image:\s*url\([\'"]?([^\'"\)]+)[\'"]?\)'
    css_urls = re.findall(css_pattern, html_content, re.I)
    # Merge both lists
    all_urls = img_urls + css_urls
    valid_image_urls = []
    # Resolve relative paths against the page URL
    for img_url in all_urls:
        full_url = urljoin(page_url, img_url)
        valid_image_urls.append(full_url)
    return valid_image_urls


def download_image(img_url, download_folder):
    """Download one image into the folder; return True on success."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://news.fzu.edu.cn/',
        }
        # Request the image bytes
        req = urllib.request.Request(img_url, headers=headers)
        response = urllib.request.urlopen(req, timeout=10)
        img_data = response.read()
        # Derive a filename from the URL, falling back to a timestamped name
        filename = os.path.basename(urlparse(img_url).path)
        if not filename:
            filename = f"image_{int(time.time())}.jpg"
        filepath = os.path.join(download_folder, filename)
        with open(filepath, 'wb') as f:
            f.write(img_data)
        print(f"下载成功: {filename}")
        return True
    except Exception as e:
        print(f"下载失败: {e}")
        return False


def main():
    url = "https://news.fzu.edu.cn/yxfd.htm"
    download_folder = "fzu_images"
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    html_content = get_page_content(url)
    if not html_content:
        return
    image_urls = get_image_urls(html_content, url)
    print(f"找到 {len(image_urls)} 张图片")
    # Download every image, pausing briefly between requests
    downloaded_count = 0
    for img_url in image_urls:
        if download_image(img_url, download_folder):
            downloaded_count += 1
        time.sleep(0.5)
    print(f"下载完成! 共下载 {downloaded_count} 张图片")


if __name__ == "__main__":
    main()
```
![b047fde6ca1a08b10d382e05a7eeae58](https://img2024.cnblogs.com/blog/3714501/202510/3714501-20251026152822402-1896520434.png) ![452c1e61bfb8c75a41afd74f18aaea7d](https://img2024.cnblogs.com/blog/3714501/202510/3714501-20251026152826284-1429015007.png) ![7f5ac0568b80d297205c52f6e4eaf198](https://img2024.cnblogs.com/blog/3714501/202510/3714501-20251026155050720-6709223.png)

**Reflections:** The basic steps hardly differ from the earlier problems. The main difficulty is extracting the image links from the HTML: search for the page's `img` tags in the Elements panel, pull out their `src` URLs, and download them; CSS background images work the same way. One possible refinement is sketched below.
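One refinement worth noting (a hedged sketch, not part of the code above): the raw URL list can contain duplicates and non-downloadable entries such as `data:` URIs, so filtering before downloading saves wasted requests. The extension list here is an assumption about which formats matter:

```python
from urllib.parse import urlparse

def filter_image_urls(urls):
    """Drop duplicates, data: URIs, and URLs without an image-like extension."""
    image_exts = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp')  # assumed set
    seen = set()
    kept = []
    for u in urls:
        if u.startswith('data:') or u in seen:
            continue
        if urlparse(u).path.lower().endswith(image_exts):
            seen.add(u)
            kept.append(u)
    return kept

# Usage: image_urls = filter_image_urls(get_image_urls(html_content, url))
```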

Gitee repository:
https://gitee.com/wudilecl/2025_crawl
