Assignment ①:
Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and save it to a database.
Overview: using the F12 Network panel to search for the key element "晴转多云" (sunny turning cloudy), I quickly located where the data lives and found it being loaded from https://www.weather.com.cn/weather/101010100.shtml; comparing different cities shows the URL pattern is identical except for the city-code string.
Further searching revealed that https://j.i8tq.com/weather2020/search/city.js stores the city → code mapping as a standard dictionary, so the code for any city can be looked up there.
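As a hedged sketch of that lookup (not part of the submitted script; the nested structure and the `AREAID`/`NAMECN` field names are assumptions about what city.js typically contains), the mapping could be loaded like this:

```python
import json

import requests


def load_city_codes():
    """Fetch city.js and flatten it into a {city_name: area_id} dict.
    Assumes the file is a single JS assignment like `var city_data = {...}`."""
    js = requests.get("https://j.i8tq.com/weather2020/search/city.js", timeout=10).text
    payload = js[js.find("{"): js.rfind("}") + 1]   # strip the `var city_data =` prefix
    codes = {}

    def walk(node):
        if isinstance(node, dict):
            if "AREAID" in node:                    # assumed leaf entry for one city/district
                codes[node.get("NAMECN", "")] = node["AREAID"]
            else:
                for child in node.values():
                    walk(child)

    walk(json.loads(payload))
    return codes


# e.g. load_city_codes().get("北京") should give "101010100"
```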
Takeaway: this assignment really drove home the value of reverse thinking in data collection — starting from a keyword visible on the page such as "晴转多云", I used the browser developer tools to trace backwards to the data source and located the hidden data URL and the city-code mapping file. It exposed the gap between a page's visible structure and its real data source, and showed that dynamic sites often load their data through APIs, so analysing network requests directly is usually a more efficient way to obtain structured data than parsing static HTML.
(screenshots)
```python
import sqlite3

import requests
from bs4 import BeautifulSoup


def get_weather_data(city_id):
    """Fetch the 7-day forecast page for one city and return (day, weather, temp) tuples."""
    url = f"https://www.weather.com.cn/weather/{city_id}.shtml"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'  # guard against garbled Chinese text
    soup = BeautifulSoup(response.text, 'lxml')

    info = []
    for tag in soup.select("ul[class='t clearfix'] li"):
        day = tag.find("h1").get_text().strip()
        weather = tag.find("p", class_="wea").get_text().strip()
        tem = tag.find("p", class_="tem").get_text().strip()
        info.append((day, weather, tem))
    return info


class WeatherDB:
    def __init__(self, db_name='WeatherDB.db'):
        self.db_name = db_name
        self.create_table()

    def get_connection(self):
        """Open a connection to the SQLite database."""
        return sqlite3.connect(self.db_name)

    def create_table(self):
        """Create the weather table if it does not exist."""
        conn = self.get_connection()
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS weather (
                              id INTEGER PRIMARY KEY AUTOINCREMENT,
                              city TEXT NOT NULL,
                              date TEXT NOT NULL,
                              weather TEXT NOT NULL,
                              temperature TEXT NOT NULL)''')
        conn.commit()
        conn.close()
        print("weather table ready")

    def add_weather(self, city, date, weather, temperature):
        """Insert one weather record."""
        conn = self.get_connection()
        cursor = conn.cursor()
        cursor.execute('''INSERT INTO weather (city, date, weather, temperature)
                          VALUES (?, ?, ?, ?)''', (city, date, weather, temperature))
        conn.commit()
        conn.close()
        print("weather record added")

    def get_weather_by_city(self, city):
        """Return all records for one city, ordered by date."""
        conn = self.get_connection()
        cursor = conn.cursor()
        cursor.execute('''SELECT city, date, weather, temperature
                          FROM weather
                          WHERE city = ?
                          ORDER BY date''', (city,))
        records = cursor.fetchall()
        conn.close()
        return records  # list of (city, date, weather, temperature)


if __name__ == "__main__":
    city_id = {
        "北京": "101010100",
        "上海": "101020100",
        "广州": "101280101",
        "深圳": "101280601",
        "杭州": "101210101"
    }
    citys = ["北京", "上海", "广州", "深圳", "杭州"]
    db = WeatherDB()
    # crawl each city and store its 7-day forecast
    for city in citys:
        for day, weather, tem in get_weather_data(city_id[city]):
            db.add_weather(city, day, weather, tem)
    records = db.get_weather_by_city("北京")
    for record in records:
        print(record)
```
Assignment ②
Requirement: use requests and JSON parsing to crawl stock information from Eastmoney (https://www.eastmoney.com/) and store it in a database.
Overview: Eastmoney delivers its quote data through the endpoint https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&cb=jQuery3710324951759523461_1761760021174&fs=<board parameter>&fields=<field list>&fid=f3&pn=<page number>&pz=<page size>&po=1&dect=1&ut=fa5fd1943c7b386f172d6893dbfba10b&wbp2u=%7C0%7C0%7C0%7Cweb&_=<timestamp>. The response is a JSONP payload; after a small trim it can be parsed as ordinary JSON.
Takeaway: the JSON data is wrapped in a dynamically named callback that carries a timestamp; once the jQuery callback prefix and the closing-parenthesis suffix are identified and removed, the JSONP response becomes standard JSON and can be parsed directly. Many sites use JSONP for cross-origin data exchange, and the key is to understand the callback mechanism and clean it off accordingly — far more efficient for obtaining structured data than parsing the HTML page.
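To make the takeaway concrete, here is a minimal sketch (my own illustration, separate from the script below): the `cb` and `_` query parameters are just a callback name and a millisecond timestamp, and the wrapper can be stripped with a regular expression before `json.loads`:

```python
import json
import re
import time

# `cb` is an arbitrary callback name and `_` a cache-busting timestamp;
# the exact naming scheme is assumed from the URL captured in DevTools.
timestamp = int(time.time() * 1000)
callback = f"jQuery{timestamp}"


def jsonp_to_json(text):
    """Strip the `callbackName( ... );` wrapper and parse the inner JSON."""
    inner = re.sub(r'^[^(]*\(|\)\s*;?\s*$', '', text)
    return json.loads(inner)


# e.g. jsonp_to_json('jQuery123_456({"rc":0,"data":null});') -> {'rc': 0, 'data': None}
```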
(screenshot)
```python
import json

import requests


def get_url(page):
    """Build the clist/get URL for one page of A-share quotes."""
    url = (f"https://push2.eastmoney.com/api/qt/clist/get?np=1&fltt=1&invt=2&cb=jQuery37105104315725764869_1761718278335"
           "&fs=m%3A0%2Bt%3A6%2Bf%3A!2%2Cm%3A0%2Bt%3A80%2Bf%3A!2%2Cm%3A1%2Bt%3A2%2Bf%3A!2%2Cm%3A1%2Bt%3A23%2Bf%3A!2%2Cm%3A0%2Bt%3A81%2Bs%3A262144%2Bf%3A!2"
           "&fields=f12%2Cf13%2Cf14%2Cf1%2Cf2%2Cf4%2Cf3%2Cf152%2Cf5%2Cf6%2Cf7%2Cf15%2Cf18%2Cf16%2Cf17%2Cf10%2Cf8%2Cf9%2Cf23"
           "&fid=f3"
           "&pn=" + str(page) + "&pz=20&po=1&dect=1&ut=fa5fd1943c7b386f172d6893dbfba10b&wbp2u=%7C0%7C0%7C0%7Cweb&_=")
    return url


def get_data(url):
    """Request one page and return the parsed JSON payload."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://quote.eastmoney.com/",
        "Origin": "https://quote.eastmoney.com",
        "Sec-Ch-Ua": '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "script",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "same-site",
        "Connection": "keep-alive",
        "Host": "push2.eastmoney.com"
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'  # guard against garbled Chinese text
    return parse_jsonp_simple(response.text)


def parse_jsonp_simple(jsonp_str):
    """Cut between the first '(' and the last ')' to turn JSONP into plain JSON."""
    start = jsonp_str.find('(')
    end = jsonp_str.rfind(')')
    if start != -1 and end != -1:
        return json.loads(jsonp_str[start + 1:end])
    raise ValueError("invalid JSONP format")


with open("eastmoney.csv", 'w', encoding='utf-8') as f:
    f.write("序号,股票代码,股票名称,最新价,涨跌幅(%),涨跌额,成交量(万手),成交额(亿元),振幅(%),最高价,最低价,今开,昨收\n")

for i in range(1, 100):
    data = get_data(get_url(i))
    news_list = (data.get('data') or {}).get('diff')
    if not news_list:  # no more pages
        break
    with open("eastmoney.csv", 'a', encoding='utf-8') as f:
        for j, stock in enumerate(news_list, 1):
            f.write(
                f"{(i - 1) * 20 + j},"
                f"{stock.get('f12', '')},"                 # stock code
                f"{stock.get('f14', '')},"                 # stock name
                f"{stock.get('f2', 0) / 100:.2f},"         # latest price
                f"{stock.get('f3', 0) / 100:.2f},"         # change (%)
                f"{stock.get('f4', 0) / 100:.2f},"         # change amount
                f"{stock.get('f5', 0) / 10000:.2f},"       # volume (10k lots)
                f"{stock.get('f6', 0) / 100000000:.2f},"   # turnover (100M CNY)
                f"{stock.get('f7', 0) / 100:.2f},"         # amplitude (%)
                f"{stock.get('f15', 0) / 100:.2f},"        # day high
                f"{stock.get('f16', 0) / 100:.2f},"        # day low
                f"{stock.get('f17', 0) / 100:.2f},"        # open
                f"{stock.get('f18', 0) / 100:.2f}\n")      # previous close
```
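The requirement asks for database storage while the script above writes a CSV; as a hedged complement (the `stocks` table name and the column subset are my own choices, not from the original script), the same rows could be inserted into SQLite like this:

```python
import sqlite3


def save_stocks(rows, db_name="stocks.db"):
    """Insert (code, name, price, change_pct) tuples into a local SQLite table."""
    conn = sqlite3.connect(db_name)
    conn.execute('''CREATE TABLE IF NOT EXISTS stocks (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        code TEXT, name TEXT,
                        price REAL, change_pct REAL)''')
    conn.executemany("INSERT INTO stocks (code, name, price, change_pct) VALUES (?, ?, ?, ?)", rows)
    conn.commit()
    conn.close()


# e.g. inside the page loop above:
# save_stocks([(s.get('f12'), s.get('f14'), s.get('f2', 0) / 100, s.get('f3', 0) / 100)
#              for s in news_list])
```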
Assignment ③:
Requirement: crawl all institutions in the 2021 Chinese university main ranking (https://www.shanghairanking.cn/rankings/bcur/2021), store them in a database, and record the browser F12 debugging/analysis process as a GIF to include in the blog post.
Overview: the required data lives in https://www.shanghairanking.cn/_nuxt/static/1761118404/rankings/bcur/2021/payload.js. The score values are easy to find, but the province and category fields are replaced by 1-2 letter codes. Inspecting the JS shows that the function's parameter list at the top corresponds one-to-one with the argument list at the end of the file.
Takeaway: the site pre-renders the complete data set into payload.js. Although the province and institution-category fields are encoded as 1-2 character tokens, carefully analysing the JavaScript parameter-to-argument mapping cracks the encoding. Front-end frameworks often ship statically optimized payloads, and the key data can hide inside a seemingly convoluted file structure; it takes patient reading of the JS logic to decode the data fully, yet compared with a dynamic API this static file is actually the more stable data source.
(GIF: F12 analysis process)
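The script below transcribes the parameter list and the argument list by hand. As a hedged alternative (the regex and the trailing `}( ... )` layout are assumptions about the usual shape of a Nuxt payload.js, not verified against this exact file), the two lists could also be extracted programmatically:

```python
import re


def build_mapping(js_text):
    """Zip the payload.js function's formal parameters with the literal
    arguments it is invoked with at the end of the file."""
    params_src = re.search(r'function\s*\(([^)]*)\)', js_text).group(1)
    tail = js_text[js_text.rfind('}(') + 2:]   # everything after the last "}("
    args_src = tail[:tail.rfind(')')]          # drop the trailing ")));"
    params = [p.strip() for p in params_src.split(',')]
    # naive split on ", " — the hand-built lists below rely on the same assumption
    args = [a.strip().strip('"') for a in args_src.split(', ')]
    return dict(zip(params, args))
```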
```python
import re
import sqlite3

import requests


def get_dict():
    """Build the letter-code -> value mapping transcribed by hand from payload.js."""
    mapping = {}
    a = ("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, A, B, C, D, E, F, G, H, I, J,"
" K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z, , $, aa, ab, ac, ad, ae, af, ag, ah, ai, aj, ak, al, am,"
" an, ao, ap, aq, ar, as, at, au, av, aw, ax, ay, az, aA, aB, aC, aD, aE, aF, aG, aH, aI, aJ, aK, aL, aM, aN, "
"aO, aP, aQ, aR, aS, aT, aU, aV, aW, aX, aY, aZ, a, a$, ba, bb, bc, bd, be, bf, bg, bh, bi, bj, bk, bl, bm,"
" bn, bo, bp, bq, br, bs, bt, bu, bv, bw, bx, by, bz, bA, bB, bC, bD, bE, bF, bG, bH, bI, bJ, bK, bL, bM, bN,"
" bO, bP, bQ, bR, bS, bT, bU, bV, bW, bX, bY, bZ, b_, b$, ca, cb, cc, cd, ce, cf, cg, ch, ci, cj, ck, cl, cm,"
"cn, co, cp, cq, cr, cs, ct, cu, cv, cw, cx, cy, cz, cA, cB, cC, cD, cE, cF, cG, cH, cI, cJ, cK, cL, cM, cN, "
"cO, cP, cQ, cR, cS, cT, cU, cV, cW, cX, cY, cZ, c_, c$, da, db, dc, dd, de, df, dg, dh, di, dj, dk, dl, dm,"
" dn, do0, dp, dq, dr, ds, dt, du, dv, dw, dx, dy, dz, dA, dB, dC, dD, dE, dF, dG, dH, dI, dJ, dK, dL, dM, dN,"
" dO, dP, dQ, dR, dS, dT, dU, dV, dW, dX, dY, dZ, d_, d$, ea, eb, ec, ed, ee, ef, eg, eh, ei, ej, ek, el, em, "
"en, eo, ep, eq, er, es, et, eu, ev, ew, ex, ey, ez, eA, eB, eC, eD, eE, eF, eG, eH, eI, eJ, eK, eL, eM, eN, eO,"
" eP, eQ, eR, eS, eT, eU, eV, eW, eX, eY, eZ, e_, e$, fa, fb, fc, fd, fe, ff, fg, fh, fi, fj, fk, fl, fm, fn, "
"fo, fp, fq, fr, fs, ft, fu, fv, fw, fx, fy, fz, fA, fB, fC, fD, fE, fF, fG, fH, fI, fJ, fK, fL, fM, fN, fO, "
"fP, fQ, fR, fS, fT, fU, fV, fW, fX, fY, fZ, f_, f$, ga, gb, gc, gd, ge, gf, gg, gh, gi, gj, gk, gl, gm, gn,"
" go, gp, gq, gr, gs, gt, gu, gv, gw, gx, gy, gz, gA, gB, gC, gD, gE, gF, gG, gH, gI, gJ, gK, gL, gM, gN, gO,"
" gP, gQ, gR, gS, gT, gU, gV, gW, gX, gY, gZ, g_, g$, ha, hb, hc, hd, he, hf, hg, hh, hi, hj, hk, hl, hm, hn, "
"ho, hp, hq, hr, hs, ht, hu, hv, hw, hx, hy, hz, hA, hB, hC, hD, hE, hF, hG, hH, hI, hJ, hK, hL, hM, hN, hO, "
"hP, hQ, hR, hS, hT, hU, hV, hW, hX, hY, hZ, h_, h$, ia, ib, ic, id, ie, if0, ig, ih, ii, ij, ik, il, im, in0,"
" io, ip, iq, ir, is, it, iu, iv, iw, ix, iy, iz, iA, iB, iC, iD, iE, iF, iG, iH, iI, iJ, iK, iL, iM, iN, iO,"
" iP, iQ, iR, iS, iT, iU, iV, iW, iX, iY, iZ, i_, i$, ja, jb, jc, jd, je, jf, jg, jh, ji, jj, jk, jl, jm, jn,"
"jo, jp, jq, jr, js, jt, ju, jv, jw, jx, jy, jz, jA, jB, jC, jD, jE, jF, jG, jH, jI, jJ, jK, jL, jM, jN, jO, "
"jP, jQ, jR, jS, jT, jU, jV, jW, jX, jY, jZ, j_, j$, ka, kb, kc, kd, ke, kf, kg, kh, ki, kj, kk, kl, km, kn,"
" ko, kp, kq, kr, ks, kt, ku, kv, kw, kx, ky, kz, kA, kB, kC, kD, kE, kF, kG, kH, kI, kJ, kK, kL, kM, kN, kO,"
" kP, kQ, kR, kS, kT, kU, kV, kW, kX, kY, kZ, k_, k$, la, lb, lc, ld, le, lf, lg, lh, li, lj, lk, ll, lm, ln,"
" lo, lp, lq, lr, ls, lt, lu, lv, lw, lx, ly, lz, lA, lB, lC, lD, lE, lF, lG, lH, lI, lJ, lK, lL, lM, lN, lO,"
" lP, lQ, lR, lS, lT, lU, lV, lW, lX, lY, lZ, l_, l$, ma, mb, mc, md, me, mf, mg, mh, mi, mj, mk, ml, mm, mn, "
"mo, mp, mq, mr, ms, mt, mu, mv, mw, mx, my, mz, mA, mB, mC, mD, mE, mF, mG, mH, mI, mJ, mK, mL, mM, mN, mO, mP,"
" mQ, mR, mS, mT, mU, mV, mW, mX, mY, mZ, m_, m$, na, nb, nc, nd, ne, nf, ng, nh, ni, nj, nk, nl, nm, nn, no,"
" np, nq, nr, ns, nt, nu, nv, nw, nx, ny, nz, nA, nB, nC, nD, nE, nF, nG, nH, nI, nJ, nK, nL, nM, nN, nO, nP, "
"nQ, nR, nS, nT, nU, nV, nW, nX, nY, nZ, n_, n$, oa, ob, oc, od, oe, of, og, oh, oi, oj, ok, ol, om, on, oo, "
"op, oq, or, os, ot, ou, ov, ow, ox, oy, oz, oA, oB, oC, oD, oE, oF, oG, oH, oI, oJ, oK, oL, oM, oN, oO, oP, "
"oQ, oR, oS, oT, oU, oV, oW, oX, oY, oZ, o_, o$, pa, pb, pc, pd, pe, pf, pg, ph, pi, pj, pk, pl, pm, pn, po,"
" pp, pq, pr, ps, pt, pu, pv, pw, px, py, pz, pA, pB, pC, pD, pE, pF, pG, pH, pI, pJ")
    a1 = a.split(', ')
    b = ('"", false, null, 0, "理工", "综合", true, "师范", "双一流", "211", "江苏", "985", "农业", "山东", "河南", '
'"河北", "北京", "辽宁", "陕西", "四川", "广东", "湖北", "湖南", "浙江", "安徽", "江西", "黑龙江", "吉林", "上海", '
'"福建", "山西", "云南", "广西", "云南", "贵州", "甘肃", "内蒙古", "重庆", "天津", "新疆", 1, "467", "496",'
' "2023-01-05T00:00:00+08:00", "林业", "林业", "5.8", "533", "23.1", "7.3", '
'"海南", "37.9", "28.0", "4.3", "12.1", "16.8", "11.7", "3.7", "4.6", "297", "397", "21.8", "32.2", '
'"16.6", "37.6", "24.6", "13.6", "13.9", "3.3", "5.2", "8.1", "3.9", "5.1", "5.6", "5.4", "2.6", '
'"162", 93.5, 89.4, 11, 14, 10, 13, "宁夏", "青海", "西藏", "11.3", "35.2", "9.5", "35.0", "32.7", "23.7",'
' "33.2", "9.2", "30.6", "8.5", "22.7", "26.3", "8.0", "10.9", "26.0", "3.2", "6.8", "5.7", "13.8", "6.5", '
'"5.5", "5.0", "13.2", "13.3", "15.6", "18.3", "3.0", "21.3", "12.0", "22.8", "3.6", "3.4", "3.5", "95",'
' "109", "117", "129", "138", "147", "159", "185", "191", "193", "196", "213", "232", "237", "240", "267", '
'"275", "301", "309", "314", "318", "332", "334", "339", "341", "354", "365", "371", "378", "384", "388",'
' "403", "416", "418", "420", "423", "430", "438", "444", "449", "452", "457", "461", "465", "474", '
'"477", "485", "487", "491", "501", "508", "513", "518", "522", "528", 83.4, "538", "555", 2021, 7,'
'"12.8", "42.9", "18.8", "36.6", "4.8", "40.0", "37.7", "11.9", "45.2", "31.8", "10.4", "40.3", "11.2", '
'"30.9", "37.8", "16.1", "19.7", "11.1", "23.8", "29.1", "0.2", "24.0", "27.3", "24.9", "39.5", "20.5",'
' "23.4", "9.0", "4.1", "25.6", "12.9", "6.4", "18.0", "24.2", "7.4", "29.7", "26.5", "22.6", "29.9",'
' "28.6", "10.1", "16.2", "19.4", "19.5", "18.6", "27.4", "17.1", "16.0", "27.6", "7.9", "28.7", "19.3", '
'"29.5", "38.2", "8.9", "3.8", "15.7", "13.5", "1.7", "16.9", "33.4", "132.7", "15.2", "8.7", "20.3", "5.3",'
' "0.3", "4.0", "17.4", "2.7", "160", "161", "164", "165", "166", "167", "168", 130.6, 105.5, 4, 2024, 15, '
'"中国大学排名(主榜)", 25, 12, "全部", "1", "88.0", 5, "2", "36.1", "25.9", "3", "34.3", 6, "4", "35.5", "21.6",'
' "39.2", "5", "10.8", "4.9", "30.4", "6", "46.2", "7", "0.8", "42.1", "8", "32.1", "22.9", "31.3", "9", '
'"43.0", "25.7", "10", "34.5", "10.0", "26.2", "46.5", "宁夏", "47.0", "33.5", "35.8", "25.8", "12", "46.7",'
' "13.7", "31.4", "33.3", "13", "34.8", "42.3", "13.4", "29.4", "14", "30.7", "15", "42.6", "26.7", "16", '
'"12.5", "17", "12.4", "44.5", "44.8", "18", "10.3", "15.8", "19", "32.3", "19.2", "20", "21", "28.8", "9.6",'
' "22", "45.0", "23", "30.8", "16.7", "16.3", "24", "25", "32.4", "26", "9.4", "27", "33.7", "18.5", "21.9",'
' "28", "30.2", "31.0", "16.4", "29", "34.4", "41.2", "2.9", "30", "38.4", "6.6", "31", "4.4", "17.0", "32", '
'"26.4", "33", "6.1", "34", "38.8", "17.7", "35", "36", "38.1", "11.5", "14.9", "37", "14.3", "18.9", "38",'
' "13.0", "39", "27.8", "33.8", "3.1", "40", "41", "28.9", "42", "28.5", "38.0", "34.0", "1.5", "43", "15.1", '
'"44", "31.2", "120.0", "14.4", "45", "149.8", "7.5", "46", "47", "38.6", "48", "49", "25.2", "50", "19.8", '
'"51", "5.9", "6.7", "52", "4.2", "53", "1.6", "54", "55", "20.0", "56", "39.8", "18.1", "57", "35.6", "58",'
' "10.5", "14.1", "59", "8.2", "60", "140.8", "12.6", "61", "62", "17.6", "63", "64", "1.1", "65", "20.9", '
'"66", "67", "68", "2.1", "69", "123.9", "27.1", "70", "25.5", "37.4", "71", "72", "73", "74", "75", "76", '
'"27.9", "7.0", "77", "78", "79", "80", "81", "82", "83", "84", "1.4", "85", "86", "87", "88", "89", "90",'
' "91", "92", "93", "109.0", "94", 235.7, "97", "98", "99", "100", "101", "102", "103", "104", "105", "106",'
' "107", "108", 223.8, "111", "112", "113", "114", "115", "116", 215.5, "119", "120", "121", "122", "123", '
'"124", "125", "126", "127", "128", 206.7, "131", "132", "133", "134", "135", "136", "137", 201, "140",'
' "141", "142", "143", "144", "145", "146", 194.6, "149", "150", "151", "152", "153", "154", "155", "156", '
'"157", "158", 183.3, "169", "170", "171", "172", "173", "174", "175", "176", "177", "178", "179", "180",'
' "181", "182", "183", "184", 169.6, "187", "188", "189", "190", 168.1, 167, "195", 165.5, "198", "199", '
'"200", "201", "202", "203", "204", "205", "206", "207", "208", "209", "210", "212", 160.5, "215", "216",'
' "217", "218", "219", "220", "221", "222", "223", "224", "225", "226", "227", "228", "229", "230", "231", '
'153.3, "234", "235", "236", 150.8, "239", 149.9, "242", "243", "244", "245", "246", "247", "248", "249", '
'"250", "251", "252", "253", "254", "255", "256", "257", "258", "259", "260", "261", "262", "263", "264", '
'"265", "266", 139.7, "269", "270", "271", "272", "273", "274", 137, "277", "278", "279", "280", "281", "282",'
' "283", "284", "285", "286", "287", "288", "289", "290", "291", "292", "293", "294", "295", "296", "300",'
' 130.2, "303", "304", "305", "306", "307", "308", 128.4, "311", "312", "313", 125.9, "316", "317", 124.9, '
'"320", "321", "Wuyi University", "322", "323", "324", "325", "326", "327", "328", "329", "330", "331", '
'120.9, 120.8, "Taizhou University", "336", "337", "338", 119.9, 119.7, "343", "344", "345", "346", "347", '
'"348", "349", "350", "351", "352", "353", 115.4, "356", "357", "358", "359", "360", "361", "362", "363",'
' "364", 112.6, "367", "368", "369", "370", 111, "373", "374", "375", "376", "377", 109.4, "380", "381",'
' "382", "383", 107.6, "386", "387", 107.1, "390", "391", "392", "393", "394", "395", "396", "400", "401", '
'"402", 104.7, "405", "406", "407", "408", "409", "410", "411", "412", "413", "414", "415", 101.2, 101.1, '
'100.9, "422", 100.3, "425", "426", "427", "428", "429", 99, "432", "433", "434", "435", "436", "437", 97.6,'
' "440", "441", "442", "443", 96.5, "446", "447", "448", 95.8, "451", 95.2, "454", "455", "456", 94.8, "459", '
'"460", 94.3, "463", "464", 93.6, "472", "473", 92.3, "476", 91.7, "479", "480", "481", "482", "483", "484", '
'90.7, 90.6, "489", "490", 90.2, "493", "494", "495", 89.3, "503", "504", "505", "506", "507", 87.4, "510", '
'"511", "512", 86.8, "515", "516", "517", 86.2, "520", "521", 85.8, "524", "525", "526", "527", 84.6, "530",'
' "531", "532", "537", 82.8, "540", "541", "542", "543", "544", "545", "546", "547", "548", "549", "550", '
'"551", "552", "553", "554", 78.1, "557", "558", "559", "560", "561", "562", "563", "564", "565", "566", "567",'
' "568", "569", "570", "571", "572", "573", "574", "575", "576", "577", "578", "579", "580", "581", "582", 9, '
'"2024-04-18T00:00:00+08:00", '
'"logo\u002Fannual\u002Fbcur\u002F2024.png", "软科中国大学排名于2015年首次发布,多年来以专业、客观、透明的优势赢得了高等教育领域内外的广泛关注和认可,已经成为具有重要社会影响力和权威参考价值的中国大学排名领先品牌。软科中国大学排名以服务中国高等教育发展和进步为导向,采用数百项指标变量对中国大学进行全方位、分类别、监测式评价,向学生、家长和全社会提供及时、可靠、丰富的中国高校可比信息。", '
'"学生、家长、高校管理人员、高教研究人员等", 2023, 2022, 2020, 2019, 2018, 2017, 2016, 2015, '
'"logo\u002FindAnalysis\u002Fbcur.png", "中国大学排名", "国内", "大学"')
    b1 = b.split(', ')
    for key, value in zip(a1, b1):
        key = key.strip().strip('"').strip("'")
        value = value.strip().strip('"').strip("'")
        mapping[key] = value
    return mapping


def get_str(url):
    """Download a text resource (payload.js here) and return it as a string."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'  # guard against garbled Chinese text
    return response.text


class Ranker:
    def __init__(self, db_name='rank_2021.db'):
        self.db_name = db_name
        self.create_table()

    def get_connection(self):
        """Open a connection to the SQLite database."""
        return sqlite3.connect(self.db_name)

    def create_table(self):
        """Create the rank table if it does not exist."""
        conn = self.get_connection()
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS rank (
                              id INTEGER PRIMARY KEY AUTOINCREMENT,
                              name TEXT NOT NULL,
                              score INT,
                              category VARCHAR(20),
                              province VARCHAR(20))''')
        conn.commit()
        conn.close()
        print("rank table ready")

    def add_data(self, name, score, category, province):
        """Insert one university record."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute('''INSERT INTO rank (name, score, category, province)
                              VALUES (?, ?, ?, ?)''', (name, score, category, province))
            conn.commit()
            return True
        except Exception as e:
            print(f"error while inserting record: {e}")
            return False
        finally:
            conn.close()

    def get_data_by_name(self, name):
        """Look up stored records for one university by name."""
        conn = self.get_connection()
        cursor = conn.cursor()
        cursor.execute('''SELECT name, score, category, province
                          FROM rank
                          WHERE name = ?''', (name,))
        rank = cursor.fetchall()
        conn.close()
        return rank
url = "https://www.shanghairanking.cn/_nuxt/static/1761118404/rankings/bcur/2021/payload.js"
txt = get_str(url)
dict = get_dict()
name = re.findall(',univNameCn:"(.?)",', txt) # 获取学校名称
score = re.findall(',score:(.?),', txt) # 获取学校总分
category = re.findall(',univCategory:(.?),', txt) # 获取学校类型
province = re.findall(😦.?),', txt) # 获取省市
ranker = Ranker()
for i in range(len(name)):
nm = name[i]
sc = float(dict[score[i]]) if score[i] in dict else score[i]
cg = str(dict[category[i]]) if category[i] in dict else category[i]
pv = str(dict[province[i]]) if province[i] in dict else province[i]
print(nm,sc,cg,pv)
添加单条数据
ranker.add_data(nm,sc,cg,pv)
print("success")