Assignment 3
Task 1:
Requirement: pick a website and crawl all of the images on it, e.g. the China Weather Network (http://www.weather.com.cn). Implement both a single-threaded and a multi-threaded crawler.
– Be sure to limit the crawl, e.g. cap the total number of pages (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits of the student ID).
Output: print the URL of every downloaded image to the console, save the downloaded images in an images subfolder, and provide screenshots.
Gitee folder link: https://gitee.com/haimisha/2025_creat_project/tree/master/作业三/第一题
Code (single-threaded):
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

IMAGE_DIR = r"D:\数据采集\实践课\实验作业三\images"
os.makedirs(IMAGE_DIR, exist_ok=True)

# Limits: 11 pages in total, 111 images in total
MAX_PAGES = 11    # crawl 11 pages in total
MAX_IMAGES = 111  # download 111 images in total
downloaded_count = 0

TARGET_PAGES = [
    "http://www.weather.com.cn/",
    "http://www.weather.com.cn/weather/101010100.shtml",    # Beijing
    "http://www.weather.com.cn/weather/101020100.shtml",    # Shanghai
    "http://www.weather.com.cn/weather/101280101.shtml",    # Guangzhou
    "http://www.weather.com.cn/weather/101270101.shtml",    # Chengdu
    "http://www.weather.com.cn/news/",
    "http://www.weather.com.cn/jrxw/",
    "http://www.weather.com.cn/zt/",
    "http://www.weather.com.cn/satellite/",
    "http://www.weather.com.cn/radar/",
    "http://www.weather.com.cn/weather40d/101010100.shtml", # Beijing 40-day forecast
]

def download_image(img_url):
    global downloaded_count
    try:
        resp = requests.get(img_url, stream=True, timeout=10)
        if resp.status_code == 200:
            ext = img_url.split('.')[-1].split('?')[0].lower()
            if ext not in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp']:
                ext = 'jpg'  # default extension
            filename = f"image_{downloaded_count + 1}.{ext}"
            filepath = os.path.join(IMAGE_DIR, filename)
            with open(filepath, 'wb') as f:
                for chunk in resp.iter_content(1024):
                    f.write(chunk)
            print(f"[Downloaded] {img_url} -> {filename}")
            downloaded_count += 1
            return True
    except Exception as e:
        print(f"[Download failed] {img_url} : {e}")
    return False

def scrape_images_from_page(url):
    global downloaded_count
    if downloaded_count >= MAX_IMAGES:
        return
    print(f"\nVisiting page: {url}")
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        resp = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(resp.text, 'html.parser')
        imgs = soup.find_all('img')
        for img in imgs:
            src = img.get('src')
            if not src:
                continue
            src = urljoin(url, src)  # resolve relative links
            if any(ext in src.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']):
                if download_image(src):
                    if downloaded_count >= MAX_IMAGES:
                        print("Reached the target of 111 images, stopping downloads.")
                        return
    except Exception as e:
        print(f"[Page error] {url} : {e}")

def main():
    for i, page_url in enumerate(TARGET_PAGES):
        if downloaded_count >= MAX_IMAGES:
            break
        print(f"\nProcessing page {i+1}/{MAX_PAGES}")
        scrape_images_from_page(page_url)
    print(f"\nDownloaded {downloaded_count} images in total, saved to {IMAGE_DIR}")

if __name__ == '__main__':
    main()
Run results:

Code (multi-threaded):
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor, as_completed

IMAGE_DIR = r"D:\数据采集\实践课\实验作业三\images"
os.makedirs(IMAGE_DIR, exist_ok=True)

# Crawl limits
MAX_PAGES = 11    # crawl 11 pages in total
MAX_IMAGES = 111  # download 111 images in total
downloaded_count = 0

TARGET_PAGES = [
    "http://www.weather.com.cn/",                           # home page
    "http://www.weather.com.cn/weather/101010100.shtml",    # Beijing
    "http://www.weather.com.cn/weather/101020100.shtml",    # Shanghai
    "http://www.weather.com.cn/weather/101280101.shtml",    # Guangzhou
    "http://www.weather.com.cn/weather/101270101.shtml",    # Chengdu
    "http://www.weather.com.cn/news/",                      # weather news
    "http://www.weather.com.cn/jrxw/",                      # meteorology news
    "http://www.weather.com.cn/zt/",                        # special topics
    "http://www.weather.com.cn/satellite/",                 # satellite imagery (likely has images)
    "http://www.weather.com.cn/radar/",                     # radar imagery
    "http://www.weather.com.cn/weather40d/101010100.shtml", # Beijing 40-day forecast
]

def download_image(img_url):
    global downloaded_count
    if downloaded_count >= MAX_IMAGES:
        return None
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(img_url, stream=True, headers=headers, timeout=10)
        if resp.status_code == 200:
            # Determine the image format
            ext = img_url.split('.')[-1].split('?')[0].lower()
            if ext not in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp']:
                ext = 'jpg'  # default extension
            filename = f"image_{downloaded_count + 1}.{ext}"
            filepath = os.path.join(IMAGE_DIR, filename)
            with open(filepath, 'wb') as f:
                for chunk in resp.iter_content(1024):
                    f.write(chunk)
            print(f"[Threaded download OK] {img_url} -> {filename}")
            downloaded_count += 1
            return img_url
    except Exception as e:
        print(f"[Threaded download failed] {img_url}: {e}")
    return None

def extract_and_queue_images_from_page(page_url):
    global downloaded_count
    if downloaded_count >= MAX_IMAGES:
        return []
    print(f"\n[Threaded] Visiting page: {page_url}")
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        resp = requests.get(page_url, headers=headers, timeout=10)
        soup = BeautifulSoup(resp.text, 'html.parser')
        img_tags = soup.find_all('img')
        urls = []
        for img in img_tags:
            src = img.get('src')
            if not src:
                continue
            full_url = urljoin(page_url, src)
            if any(ext in full_url.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']):
                urls.append(full_url)
        return urls
    except Exception as e:
        print(f"[Threaded page error] {page_url}: {e}")
        return []

def main():
    global downloaded_count
    all_image_urls = []

    # Step 1: extract all image URLs from the 11 pages
    for page_url in TARGET_PAGES:
        if downloaded_count >= MAX_IMAGES:
            break
        print(f"\n[Threaded] Processing page {TARGET_PAGES.index(page_url)+1}/{MAX_PAGES}: {page_url}")
        urls = extract_and_queue_images_from_page(page_url)
        all_image_urls.extend(urls)

    # Step 2: deduplicate
    all_image_urls = list(set(all_image_urls))
    print(f"Extracted {len(all_image_urls)} candidate image URLs, starting downloads...")

    # Step 3: download with the thread pool (at most MAX_IMAGES images)
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for url in all_image_urls:
            if downloaded_count >= MAX_IMAGES:
                break
            futures.append(executor.submit(download_image, url))
        # Wait for all tasks to finish (optional, useful for debugging)
        for future in as_completed(futures):
            future.result()  # the return value is available here; we only wait

    print(f"\n[Threaded] Done! Downloaded {downloaded_count} images in total, saved to: {IMAGE_DIR}")

if __name__ == '__main__':
    main()
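One caveat with the multi-threaded version: downloaded_count is read and incremented from several worker threads without any synchronization, so the counter (and the file names derived from it) can race. Below is a minimal sketch of one way to guard that shared state with a threading.Lock; the lock and the helper next_image_index are illustrative additions of mine, not part of the original script.

import threading

# Illustrative guard for the shared counter used by download_image() above
# (assumes the same global MAX_IMAGES / downloaded_count names).
counter_lock = threading.Lock()

def next_image_index():
    """Atomically reserve the next image index across worker threads."""
    global downloaded_count
    with counter_lock:
        if downloaded_count >= MAX_IMAGES:
            return None              # quota already reached
        downloaded_count += 1
        return downloaded_count      # 1-based index used in the file name

download_image() would then call next_image_index() once, use the returned index to build the file name, and skip the download when it returns None, instead of reading and incrementing downloaded_count directly.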
Run results:

Task 2:
Requirement: become proficient with serializing scraped data through Scrapy's Item and Pipeline mechanisms; use the Scrapy framework + XPath + MySQL storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/
Output: the data is stored in MySQL in the format below.
Column headers use English names, e.g. id for the serial number, bStockNo for the stock code, ……; the schema is designed by each student.
Gitee folder link: https://gitee.com/haimisha/2025_creat_project/tree/master/作业三/第一题
Code:
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class EastmoneyProjectItem(scrapy.Item):
    id = scrapy.Field()               # serial number
    stock_code = scrapy.Field()       # stock code
    stock_name = scrapy.Field()       # stock name
    current_price = scrapy.Field()    # latest price
    change_amount = scrapy.Field()    # price change
    change_rate = scrapy.Field()      # change rate (%)
    turnover = scrapy.Field()         # trading volume
    turnover_amount = scrapy.Field()  # trading amount
stockquote.py
import scrapy
from eastmoney_project.items import EastmoneyProjectItem
import json

class StockquoteSpider(scrapy.Spider):
    name = 'stockquote'
    allowed_domains = ['push2.eastmoney.com', 'quote.eastmoney.com']

    def start_requests(self):
        base_url = "http://push2.eastmoney.com/api/qt/clist/get"
        params = {
            'pn': '1',     # page number
            'pz': '20',    # page size
            'po': '1',
            'np': '1',
            'fltt': '2',
            'invt': '2',
            'fid': 'f3',
            'fs': 'm:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23',
            'fields': 'f1,f2,f3,f4,f5,f6,f7,f12,f14'  # fields to return
        }
        # Build the request URL from the parameters
        url = base_url + '?' + '&'.join([f'{k}={v}' for k, v in params.items()])
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Parse the JSON returned by the API
        data = json.loads(response.text)
        # Extract the stock list from the JSON; the exact path depends on the actual response structure
        stock_list = data['data']['diff']
        for stock in stock_list:
            item = EastmoneyProjectItem()
            # Map the JSON fields to our custom Item fields
            item['stock_code'] = stock.get('f12')      # e.g. f12 is the stock code
            item['stock_name'] = stock.get('f14')      # e.g. f14 is the stock name
            item['current_price'] = stock.get('f2')    # latest price
            item['change_rate'] = stock.get('f3')      # change rate
            item['change_amount'] = stock.get('f4')    # price change
            item['turnover'] = stock.get('f5')         # trading volume
            item['turnover_amount'] = stock.get('f6')  # trading amount
            # ... other field mappings
            yield item
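The spider above only requests the first page of the list API (pn=1, 20 records per page). If more records are needed, one possible extension is to yield one request per page in start_requests; the sketch below is my own variant (the MAX_LIST_PAGES constant is illustrative), assuming the API keeps the same parameter names:

# Hypothetical drop-in replacement for StockquoteSpider.start_requests that
# walks several pages of the list API; not part of the original spider.
def start_requests(self):
    base_url = "http://push2.eastmoney.com/api/qt/clist/get"
    MAX_LIST_PAGES = 5  # illustrative page cap
    for page in range(1, MAX_LIST_PAGES + 1):
        params = {
            'pn': str(page),  # page number
            'pz': '20',       # page size
            'po': '1', 'np': '1', 'fltt': '2', 'invt': '2',
            'fid': 'f3',
            'fs': 'm:0 t:6,m:0 t:80,m:1 t:2,m:1 t:23',
            'fields': 'f1,f2,f3,f4,f5,f6,f7,f12,f14',
        }
        url = base_url + '?' + '&'.join(f'{k}={v}' for k, v in params.items())
        yield scrapy.Request(url=url, callback=self.parse)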
pipelines.py
import pymysql

class EastmoneyProjectPipeline:
    def __init__(self):
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        # Open the database connection when the spider starts
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='eastmoney',
            charset='utf8mb4'
        )
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Close the database connection when the spider finishes
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        # Insert every Item into the database
        sql = """
            INSERT INTO stock_data
            (stock_code, stock_name, current_price, change_amount, change_rate, turnover, turnover_amount)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            item['stock_code'],
            item['stock_name'],
            item['current_price'],
            item['change_amount'],
            item['change_rate'],
            item['turnover'],
            item['turnover_amount']
        )
        try:
            self.cursor.execute(sql, values)
            self.conn.commit()    # commit the transaction
        except Exception as e:
            self.conn.rollback()  # roll back on error
            print(f"Database insert error: {e}")
        return item
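The pipeline assumes that a stock_data table already exists in the eastmoney database. A minimal setup sketch is shown below; the column types are my own assumption and should be adapted to the schema you design (e.g. the id / bStockNo naming from the requirement):

import pymysql

# Hypothetical one-off setup script; the table name matches the pipeline above,
# but the column types are assumptions, not part of the assignment code.
conn = pymysql.connect(host='localhost', user='root', password='123456',
                       database='eastmoney', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS stock_data (
            id INT AUTO_INCREMENT PRIMARY KEY,   -- serial number
            stock_code VARCHAR(16),              -- stock code
            stock_name VARCHAR(64),              -- stock name
            current_price DECIMAL(12, 2),        -- latest price
            change_amount DECIMAL(12, 2),        -- price change
            change_rate DECIMAL(12, 2),          -- change rate (%)
            turnover BIGINT,                     -- trading volume
            turnover_amount DECIMAL(18, 2)       -- trading amount
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()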
settings.py
# Scrapy settings for eastmoney_project project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "eastmoney_project"

SPIDER_MODULES = ["eastmoney_project.spiders"]
NEWSPIDER_MODULE = "eastmoney_project.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "eastmoney_project (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "eastmoney_project.middlewares.EastmoneyProjectSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "eastmoney_project.middlewares.EastmoneyProjectDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "eastmoney_project.pipelines.EastmoneyProjectPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
Run results:

Task 3:
Requirement: become proficient with serializing scraped data through Scrapy's Item and Pipeline mechanisms; use the Scrapy framework + XPath + MySQL storage to crawl foreign-exchange data.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
Output:
Gitee folder link: https://gitee.com/haimisha/2025_creat_project/tree/master/作业三/第三题
Code:
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class BocForexItem(scrapy.Item):
    currency = scrapy.Field()  # currency name
    tbp = scrapy.Field()       # spot exchange buying rate
    cbp = scrapy.Field()       # cash buying rate
    tsp = scrapy.Field()       # spot exchange selling rate
    csp = scrapy.Field()       # cash selling rate
    time = scrapy.Field()      # publication time
forex_spider.py
import scrapy
from boc_forex.items import BocForexItem

class ForexSpiderSpider(scrapy.Spider):
    name = 'forex_spider'
    allowed_domains = ['www.boc.cn']
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        self.logger.info(f"Parsing page: {response.url}")
        # Use a more specific XPath to locate the table rows (verified to work)
        rows = response.xpath('//div[@class="publish"]//table//tr')
        self.logger.info(f"Found {len(rows)} rows")
        # Skip the header (first row) and iterate over the data rows
        for i, row in enumerate(rows[1:]):
            try:
                # Extract the text of every cell in the current row
                cells = row.xpath('./td//text()').getall()
                # Clean the text: drop empty strings and strip whitespace
                cells = [cell.strip() for cell in cells if cell.strip()]
                self.logger.info(f"Row {i+1}: {cells}")
                # Make sure the row has enough columns (at least 6)
                if len(cells) >= 6:
                    item = BocForexItem()
                    item['currency'] = cells[0]  # currency name
                    item['tbp'] = cells[1]       # spot exchange buying rate
                    item['cbp'] = cells[2]       # cash buying rate
                    item['tsp'] = cells[3]       # spot exchange selling rate
                    item['csp'] = cells[4]       # cash selling rate
                    item['time'] = cells[5]      # publication time
                    yield item
                else:
                    self.logger.warning(f"Row {i+1} is incomplete (only {len(cells)} columns), skipped")
            except Exception as e:
                self.logger.error(f"Error while parsing row {i+1}: {str(e)}", exc_info=True)
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
import pymysql

class BocForexPipeline:
    def __init__(self):
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        # Database connection settings
        self.conn = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='boc_forex',
            charset='utf8mb4'
        )
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        sql = """
            INSERT INTO forex_data (currency, tbp, cbp, tsp, csp, time)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        values = (
            item.get('currency'),
            item.get('tbp'),
            item.get('cbp'),
            item.get('tsp'),
            item.get('csp'),
            item.get('time')
        )
        try:
            self.cursor.execute(sql, values)
            self.conn.commit()
        except Exception as e:
            self.conn.rollback()
            print(f"Database insert error: {e}")
        return item
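As in Task 2, this pipeline expects a forex_data table to exist in the boc_forex database. A minimal setup sketch follows; the column types are my assumption (the spider yields the rates as strings, so VARCHAR keeps them insertable as-is):

import pymysql

# Hypothetical one-off setup script; the table name matches the pipeline above,
# the column definitions are assumptions.
conn = pymysql.connect(host='localhost', user='root', password='123456',
                       database='boc_forex', charset='utf8mb4')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS forex_data (
            id INT AUTO_INCREMENT PRIMARY KEY,
            currency VARCHAR(32),  -- currency name
            tbp VARCHAR(16),       -- spot exchange buying rate
            cbp VARCHAR(16),       -- cash buying rate
            tsp VARCHAR(16),       -- spot exchange selling rate
            csp VARCHAR(16),       -- cash selling rate
            time VARCHAR(32)       -- publication time
        ) DEFAULT CHARSET = utf8mb4
    """)
conn.commit()
conn.close()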
settings.py
# Scrapy settings for boc_forex project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "boc_forex"

SPIDER_MODULES = ["boc_forex.spiders"]
NEWSPIDER_MODULE = "boc_forex.spiders"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "boc_forex (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "boc_forex.middlewares.BocForexSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "boc_forex.middlewares.BocForexDownloaderMiddleware": 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "boc_forex.pipelines.BocForexPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
Run results:

Reflections: During the data-extraction stage, I came to appreciate how important XPath selectors are. At first, the XPath expressions I wrote often failed to locate the target data; only after repeatedly debugging and testing them with the browser's developer tools did I find a stable way to locate the elements. In particular, when handling the Bank of China exchange-rate table, I had to analyze the HTML structure carefully and skip the header row before I could reliably extract the currency name, buying rates, selling rates, and other fields of each record. This data-collection assignment also gave me a real sense of the gap between theoretical knowledge and practical application, and of the growth that comes from bridging it. Combining the power of the Scrapy framework with reliable MySQL storage provides a complete solution for data processing.