Assignment 1
Code and Results
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("""create table weathers (wCity varchar(16), wDate varchar(16),
                wWeather varchar(64), wTemp varchar(32),
                constraint pk_weather primary key (wCity, wDate))""")
        except:
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(f"插入数据失败:{err}")

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("城市", "日期", "天气", "温度"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100",
                         "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(f"{city}:未找到对应城市代码")
            return
        url = f"http://www.weather.com.cn/weather/{self.cityCode[city]}.shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    # Fix the quote-nesting conflict in the temperature extraction:
                    # split into separate variables so no quotes are nested in the f-string.
                    max_temp = li.select('p[class="tem"] span')[0].text
                    min_temp = li.select('p[class="tem"] i')[0].text
                    temp = f"{max_temp}/{min_temp}"  # plain concatenation, no nested quotes
                    print(f"{city} | {date} | {weather} | {temp}")
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(f"提取单条天气数据失败:{err}")
        except Exception as err:
            print(f"爬取{city}天气页面失败:{err}")

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        # self.db.show()
        self.db.closeDB()


# Run the crawl
ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("天气数据爬取与存储完成")

Reflections
While extracting the temperatures I ran into a quote-nesting conflict inside the f-string; splitting the two selections into separate variables and concatenating them afterwards solved it, as sketched below.
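A minimal sketch of that fix, assuming a made-up HTML fragment that mirrors the structure of one forecast item (the fragment and values are purely illustrative):

from bs4 import BeautifulSoup

# Hypothetical fragment mimicking one forecast <li> on weather.com.cn.
html = '<li><h1>5日</h1><p class="wea">多云</p><p class="tem"><span>20</span>/<i>12℃</i></p></li>'
li = BeautifulSoup(html, "lxml").li

# Writing f"{li.select("p[class='tem'] span")[0].text}/..." would nest double quotes
# inside the f-string expression, a SyntaxError before Python 3.12; pulling the
# selections into variables sidesteps the nesting entirely.
max_temp = li.select('p[class="tem"] span')[0].text  # daytime temperature
min_temp = li.select('p[class="tem"] i')[0].text     # nighttime temperature
print(f"{max_temp}/{min_temp}")                      # -> 20/12℃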
I got to grips with the core usage of BeautifulSoup and learned to pinpoint page elements precisely with CSS selectors, for example pulling the date, weather and temperature out of a fairly complex HTML structure, and came to see that page parsing boils down to "locate, extract, clean" (see the sketch below).
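A minimal locate-extract-clean sketch over a made-up fragment of that page structure; the selectors follow the script above, and the cleaning step is only an illustration:

from bs4 import BeautifulSoup

# Hypothetical markup standing in for the forecast list on the weather page.
html = """
<ul class="t clearfix">
  <li><h1>6日(明天)</h1><p class="wea">小雨</p><p class="tem"><span>18</span>/<i>10℃</i></p></li>
</ul>
"""
soup = BeautifulSoup(html, "lxml")

for li in soup.select("ul[class='t clearfix'] li"):  # locate: narrow down to each forecast item
    date = li.select("h1")[0].text                   # extract: raw node text
    weather = li.select('p[class="wea"]')[0].text
    date = date.split("(")[0]                        # clean: drop the parenthetical weekday note
    print(date, weather)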
Beyond the concrete skills, this exercise trained a "spot the problem, analyse it, solve it" way of working and built up useful experience for more complex scraping or data-analysis projects later on.
Assignment 2
Code and Results
import requests
import json
import time
import csv
import os


def get_stock_data(page):
    url = "http://69.push2.eastmoney.com/api/qt/clist/get"
    params = {
        "pn": page, "pz": 20, "po": 1, "np": 1,
        "ut": "bd1d9ddb00efe4882cddb8fe999b62f7c",
        "fltt": 2, "invt": 2, "fid": "f3",
        "fs": "m:0+f:8,m:1+f:8",
        "fields": "f12,f14,f2,f3,f4,f5,f6,f7",
        "_": int(time.time() * 1000)
    }
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Referer": "http://quote.eastmoney.com/"
    }
    resp = requests.get(url, params=params, headers=headers)
    data = json.loads(resp.text)
    return data["data"]["diff"] if data.get("data") else []


def print_and_save_stocks(stocks, start_idx, csv_path, is_first):
    # Print the rows
    if is_first:
        print("序号 代码 名称 最新价 涨跌幅 涨跌额 成交量(万手) 成交额(亿) 振幅")
    for i, stock in enumerate(stocks, start_idx):
        code = stock.get("f12", "")
        name = stock.get("f14", "")
        price = round(stock.get("f2", 0.0), 2)
        chg_pct = round(stock.get("f3", 0.0), 2)
        chg_amt = round(stock.get("f4", 0.0), 2)
        vol_ten_thousand = round(stock.get("f5", 0) / 10000, 2)
        amt_100m = round(stock.get("f6", 0.0) / 100000000, 2)
        amp = round(stock.get("f7", 0.0), 2)
        print("%2d %6s %-8s %6.2f %6.2f%% %7.2f %12.2f %10.2f %6.2f%%"
              % (i, code, name, price, chg_pct, chg_amt, vol_ten_thousand, amt_100m, amp))

    # Save to CSV
    csv_dir = os.path.dirname(csv_path)
    if not os.path.exists(csv_dir):
        os.makedirs(csv_dir)
    headers = ["序号", "代码", "名称", "最新价", "涨跌幅(%)", "涨跌额", "成交量(万手)", "成交额(亿)", "振幅(%)"]
    with open(csv_path, "a", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        if is_first:
            writer.writeheader()
        for i, stock in enumerate(stocks, start_idx):
            writer.writerow({
                "序号": i,
                "代码": stock.get("f12", ""),
                "名称": stock.get("f14", ""),
                "最新价": round(stock.get("f2", 0.0), 2),
                "涨跌幅(%)": round(stock.get("f3", 0.0), 2),
                "涨跌额": round(stock.get("f4", 0.0), 2),
                "成交量(万手)": round(stock.get("f5", 0) / 10000, 2),
                "成交额(亿)": round(stock.get("f6", 0.0) / 100000000, 2),
                "振幅(%)": round(stock.get("f7", 0.0), 2)
            })


def main():
    total_pages = 2
    csv_path = "股票/创新股数据/创新股股票数据.csv"
    for page in range(1, total_pages + 1):
        print(f"\n====== 第{page}页数据 ======")
        stocks = get_stock_data(page)
        if stocks:
            start_idx = (page - 1) * 20 + 1
            print_and_save_stocks(stocks, start_idx, csv_path, page == 1)
            print(f"第{page}页数据已保存到 {csv_path}")
        else:
            print(f"第{page}页未获取到有效数据")
        time.sleep(1)


if __name__ == "__main__":
    main()


In the API response, f2 is the latest price, f3 the change percentage, f4 the change amount, f5 the trading volume, f6 the turnover, and f7 the amplitude.
From these fields we can pull out exactly the information we need, as the sketch below makes explicit.
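One way to make that key-to-name mapping explicit is a small lookup table; a sketch, where the FIELD_NAMES dict, the rename_fields helper and the sample record are illustrative rather than part of the script above:

# Hypothetical mapping from the Eastmoney field keys to readable column names,
# matching how the script above interprets them (f7 is the amplitude).
FIELD_NAMES = {
    "f12": "代码", "f14": "名称", "f2": "最新价", "f3": "涨跌幅(%)",
    "f4": "涨跌额", "f5": "成交量", "f6": "成交额", "f7": "振幅(%)",
}

def rename_fields(stock: dict) -> dict:
    """Turn one raw API record into a dict keyed by readable names."""
    return {name: stock.get(key) for key, name in FIELD_NAMES.items()}

# Made-up sample record for demonstration only.
sample = {"f12": "300001", "f14": "示例股票", "f2": 21.5, "f3": 1.2,
          "f4": 0.26, "f5": 123456, "f6": 2.6e8, "f7": 3.4}
print(rename_fields(sample))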

Reflections
When saving the data, the is_first flag marks whether the current batch is the first page, so the CSV header is written exactly once instead of being repeated; os.makedirs handles the case where the target directory does not exist yet. A stripped-down sketch of the pattern follows.
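A minimal sketch of that write-once-header pattern; the append_rows helper, file path and field names are illustrative, and os.makedirs is called with exist_ok=True to cover the missing-directory case:

import csv
import os

def append_rows(csv_path, rows, is_first):
    # Create the parent directory only if the path actually has one.
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
    with open(csv_path, "a", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=["代码", "名称", "最新价"])
        if is_first:
            writer.writeheader()   # header is written exactly once, on the first page
        writer.writerows(rows)

# The first page writes the header, later pages only append data rows.
append_rows("demo/股票demo.csv", [{"代码": "000001", "名称": "示例A", "最新价": 10.5}], is_first=True)
append_rows("demo/股票demo.csv", [{"代码": "000002", "名称": "示例B", "最新价": 7.8}], is_first=False)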
On the interface-analysis side, capturing traffic with the browser developer tools made the pattern of the stock-data API clear: pn controls the page number, pz sets the page size, and within fields f12 maps to the stock code, f14 to the stock name, and so on. This key-value mapping showed me that many sites do not render their data directly into the HTML; it comes back from a backend API as JSON, which gives targeted scraping a much more efficient path. A pared-down sketch of the paging pattern follows.
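A sketch of that paging pattern, reusing the endpoint and parameter values from the script above (fetch_page is a hypothetical helper; only pn changes between requests, pz fixes the page size, and fields trims the returned keys):

import time
import requests

BASE_URL = "http://69.push2.eastmoney.com/api/qt/clist/get"
HEADERS = {"User-Agent": "Mozilla/5.0", "Referer": "http://quote.eastmoney.com/"}

def fetch_page(page, page_size=20):
    params = {
        "pn": page,                      # page number
        "pz": page_size,                 # rows per page
        "po": 1, "np": 1, "fltt": 2, "invt": 2, "fid": "f3",
        "ut": "bd1d9ddb00efe4882cddb8fe999b62f7c",
        "fs": "m:0+f:8,m:1+f:8",
        "fields": "f12,f14,f2,f3",       # only the keys we actually read
        "_": int(time.time() * 1000),    # cache-busting timestamp
    }
    resp = requests.get(BASE_URL, params=params, headers=HEADERS, timeout=10)
    payload = resp.json()
    return payload["data"]["diff"] if payload.get("data") else []

for page in (1, 2):
    print(f"page {page}: {len(fetch_page(page))} rows")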
For data handling, requests keeps the network code short and direct: compared with urllib, passing parameters and reading the response is more straightforward. Parsing JSON shows the benefit of structured data, since the needed fields can be read by key with no HTML parsing at all, which speeds extraction up considerably. Rounding and unit conversion also make the printed and stored output tidier and easier to read; a tiny sketch of both points follows.
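A tiny sketch of those two points, using httpbin.org purely as a stand-in endpoint and an invented raw value: a dict of params goes straight into requests.get, the JSON response is addressed by key, and a round plus unit conversion tidies the output.

import requests

resp = requests.get("https://httpbin.org/get", params={"pn": 1, "pz": 20}, timeout=10)
data = resp.json()                 # parsed straight into a dict, no HTML tree to walk
print(data["args"])                # {'pn': '1', 'pz': '20'}

raw_turnover = 1234567890.0        # illustrative raw turnover in yuan
print(round(raw_turnover / 1e8, 2), "亿")   # -> 12.35 亿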
Assignment 3
Code and Results
import urllib.request
import re
import execjs
import sqlite3

url = "https://www.shanghairanking.cn/_nuxt/static/1762223212/rankings/bcur/2021/payload.js"


def extract_data(js_content):
    # Match the __NUXT_JSONP__ wrapper: function(params){body}(args)
    m = re.search(r'__NUXT_JSONP__.*?function\(([^)]*)\){([\s\S]*)}\(([^)]*)\)', js_content)
    if not m:
        raise ValueError("未匹配到数据结构")
    params, body, args = m.group(1), m.group(2), m.group(3)

    # Clean up the JS function body
    body = body.replace('\\', '').strip()                      # drop escape characters
    body = re.sub(r'([{,]\s*)(\w+)(\s*:)', r'\1"\2"\3', body)  # quote bare object keys
    body = re.sub(r';\s*}', '}', body)                         # drop trailing semicolons

    # Convert the JS argument literals to Python equivalents
    args_py = args.replace('false', 'False') \
                  .replace('true', 'True') \
                  .replace('null', 'None')

    # Execute the JS function
    ctx = execjs.compile(f"function f({params}){{ {body} }};")
    result = ctx.call("f", *eval(args_py))

    # Pull out the fields we need
    return [{
        '排名': u.get('ranking'),
        '学校': u.get('univNameCn'),
        '省市': u.get('province'),
        '类型': u.get('univCategory'),
        '总分': u.get('score')
    } for u in result['data'][0]['univData']]


def init_db():
    conn = sqlite3.connect('university_rankings.db')
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS bcur_rankings (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        ranking TEXT,
        university_name TEXT,
        province TEXT,
        category TEXT,
        score TEXT)''')
    conn.commit()
    conn.close()


def save_data(univs):
    conn = sqlite3.connect('university_rankings.db')
    cursor = conn.cursor()
    data = [(u['排名'], u['学校'], u['省市'], u['类型'], u['总分']) for u in univs]
    cursor.executemany('''INSERT INTO bcur_rankings
        (ranking, university_name, province, category, score)
        VALUES (?, ?, ?, ?, ?)''', data)
    conn.commit()
    conn.close()
    print(f"已保存 {len(univs)} 条数据")


def show_data(univs):
    print("2021软科中国大学排名主榜")
    print(f"{'排名':<6} {'学校':<20} {'省市':<8} {'类型':<10} {'总分'}")
    print("-" * 55)
    for u in univs[:50]:
        print(f"{u['排名']:<6} {u['学校']:<20} {u['省市']:<8} {u['类型']:<10} {u['总分']}")
    if len(univs) > 50:
        print(f"... 共 {len(univs)} 条数据,显示前50条 ...")


def main():
    try:
        # Fetch the JS payload
        req = urllib.request.Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"})
        js_content = urllib.request.urlopen(req).read().decode()
        univs = extract_data(js_content)
        init_db()
        save_data(univs)
        show_data(univs)
    except Exception as e:
        print(f"错误: {str(e)}")


if __name__ == "__main__":
    main()


Reflections
Scraping the 2021 Shanghai Ranking (软科) Best Chinese Universities main list and storing it in a database taught me a lot, both about applying the techniques and about working through problems. When I first analysed the page I found that the data is not embedded in the HTML but loaded dynamically by JavaScript, which pushed me past the usual scraping approach and led me to the payload.js file that actually holds the data; it drove home that page analysis is the prerequisite for any crawl.
Data extraction was the biggest challenge: escape characters and unquoted keys in the JS function body had to be cleaned up step by step with regular expressions, and converting the argument formats for execjs deepened my understanding of passing data between languages. Working with SQLite also made clear that a sensible table schema and batch inserts are what keep the stored data complete and the writes efficient. The sketch below replays both steps on toy input.
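A minimal sketch of those two fiddly steps (the JS snippet and the rows are illustrative): the key-quoting regex used in extract_data, and an executemany batch insert like the one in save_data, here against an in-memory database.

import re
import sqlite3

# 1) Quote bare JS object keys so the body moves closer to JSON/Python syntax.
js_body = '{ranking:1,univNameCn:"示例大学",score:"95.0"}'
quoted = re.sub(r'([{,]\s*)(\w+)(\s*:)', r'\1"\2"\3', js_body)
print(quoted)  # {"ranking":1,"univNameCn":"示例大学","score":"95.0"}

# 2) Insert all rows in one executemany call instead of one INSERT per record.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE rankings (ranking TEXT, name TEXT, score TEXT)")
rows = [("1", "示例大学A", "95.0"), ("2", "示例大学B", "90.0")]
conn.executemany("INSERT INTO rankings VALUES (?, ?, ?)", rows)
print(conn.execute("SELECT COUNT(*) FROM rankings").fetchone()[0])  # 2
conn.close()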
https://gitee.com/lai-yixuan/2025_crawl_project/tree/master/作业2