102302124_严涛_Assignment 4


1. Use the Selenium framework with database storage to crawl stock data for the 沪深A股, 上证A股, and 深证A股 boards. Note: the assignment specifies MySQL, but the implementation below stores to SQLite via the sqlite3 module.
(1) Code:
import sqlite3
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from typing import List, Tuple

CONFIG = {
    "db_name": "stocks.db",
    "max_page": 3,
    "page_load_wait": 10,
    "implicit_wait": 5,
    "board_list": [
        ("沪深A股", "#hs_a_board"),
        ("上证A股", "#sh_a_board"),
        ("深证A股", "#sz_a_board"),
    ],
}

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


class StockDataCrawler:
    def __init__(self):
        self.driver = None
        self.conn = None

    def initialize_driver(self) -> webdriver.Chrome:
        options = Options()
        options.add_experimental_option('excludeSwitches', ['enable-automation'])
        options.add_argument('--disable-blink-features=AutomationControlled')
        driver = webdriver.Chrome(options=options)
        driver.maximize_window()
        driver.implicitly_wait(CONFIG["implicit_wait"])
        return driver

    def initialize_database(self) -> sqlite3.Connection:
        conn = sqlite3.connect(CONFIG["db_name"])
        cursor = conn.cursor()
        cursor.execute("DROP TABLE IF EXISTS stock_data")
        create_table_sql = """
            CREATE TABLE IF NOT EXISTS stock_data (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                board_type TEXT NOT NULL,
                stock_code TEXT NOT NULL,
                stock_name TEXT NOT NULL,
                latest_price TEXT,
                change_percent TEXT,
                change_amount TEXT,
                volume TEXT,
                turnover TEXT,
                amplitude TEXT,
                high TEXT,
                low TEXT,
                open_price TEXT,
                prev_close TEXT,
                crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """
        cursor.execute(create_table_sql)
        conn.commit()
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_stock_code ON stock_data(stock_code)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_board_type ON stock_data(board_type)")
        conn.commit()
        logger.info("Database initialized")
        return conn

    def wait_for_table_load(self, timeout: int = 10):
        try:
            wait = WebDriverWait(self.driver, timeout)
            wait.until(EC.presence_of_element_located((By.XPATH, "//table//tbody/tr")))
        except Exception as e:
            logger.warning(f"Timed out waiting for the table to load: {e}")

    def parse_table_row(self, tr_element, board_name: str) -> Tuple:
        try:
            tds = tr_element.find_elements(By.TAG_NAME, "td")
            if len(tds) < 14:
                return None
            return (
                board_name,
                tds[1].text.strip(),   # stock code
                tds[2].text.strip(),   # stock name
                tds[4].text.strip() if tds[4].text else "0.00",    # latest price
                tds[5].text.strip() if tds[5].text else "0.00%",   # change percent
                tds[6].text.strip() if tds[6].text else "0.00",    # change amount
                tds[7].text.strip() if tds[7].text else "0",       # volume
                tds[8].text.strip() if tds[8].text else "0.00万",  # turnover
                tds[9].text.strip() if tds[9].text else "0.00%",   # amplitude
                tds[10].text.strip() if tds[10].text else "0.00",  # high
                tds[11].text.strip() if tds[11].text else "0.00",  # low
                tds[12].text.strip() if tds[12].text else "0.00",  # open
                tds[13].text.strip() if tds[13].text else "0.00",  # previous close
            )
        except Exception as e:
            logger.error(f"Failed to parse a table row: {e}")
            return None

    def crawl_board_data(self, board_name: str, board_code: str) -> int:
        total_rows = 0
        url = f"http://quote.eastmoney.com/center/gridlist.html{board_code}"
        logger.info(f"Crawling {board_name}, URL: {url}")
        try:
            self.driver.get(url)
            self.wait_for_table_load()
            for page in range(1, CONFIG["max_page"] + 1):
                logger.info(f"  Crawling page {page}...")
                time.sleep(2)
                tr_elements = self.driver.find_elements(By.XPATH, "//table//tbody/tr")
                if not tr_elements:
                    logger.warning(f"No data found on page {page}")
                    continue
                data_to_save = []
                for tr in tr_elements:
                    row_data = self.parse_table_row(tr, board_name)
                    if row_data:
                        data_to_save.append(row_data)
                if data_to_save:
                    self.save_to_database(data_to_save)
                    total_rows += len(data_to_save)
                    logger.info(f"    Page {page} saved, {len(data_to_save)} records")
                if page < CONFIG["max_page"]:
                    if not self.go_to_next_page():
                        logger.warning("Could not turn the page; probably at the last page")
                        break
            logger.info(f"{board_name} done, {total_rows} rows fetched")
            return total_rows
        except Exception as e:
            logger.error(f"Error while crawling {board_name}: {e}")
            return total_rows

    def go_to_next_page(self) -> bool:
        try:
            next_button = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a[title='下一页']"))
            )
            self.driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)
            self.wait_for_table_load()
            return True
        except Exception as e:
            logger.warning(f"Failed to go to the next page: {e}")
            return False

    def save_to_database(self, data: List[Tuple]):
        if not data:
            return
        insert_sql = """
            INSERT INTO stock_data (
                board_type, stock_code, stock_name, latest_price,
                change_percent, change_amount, volume, turnover,
                amplitude, high, low, open_price, prev_close
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """
        try:
            cursor = self.conn.cursor()
            cursor.executemany(insert_sql, data)
            self.conn.commit()
        except Exception as e:
            logger.error(f"Failed to save data to the database: {e}")
            self.conn.rollback()
            raise

    def run(self):
        logger.info("Stock data crawler starting...")
        try:
            self.conn = self.initialize_database()
            self.driver = self.initialize_driver()
            total_records = 0
            for board_name, board_code in CONFIG["board_list"]:
                records = self.crawl_board_data(board_name, board_code)
                total_records += records
            cursor = self.conn.cursor()
            cursor.execute("SELECT COUNT(*) FROM stock_data")
            final_count = cursor.fetchone()[0]
            logger.info(f"Done! Crawled {total_records} rows; the database now holds {final_count} records")
        except Exception as e:
            logger.error(f"Crawler failed: {e}", exc_info=True)
        finally:
            if self.driver:
                self.driver.quit()
                logger.info("Browser closed")
            if self.conn:
                self.conn.close()
                logger.info("Database connection closed")
            logger.info("Crawler finished")


if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == "--test":
        CONFIG["max_page"] = 1
        logger.info("Test mode: only crawl the first page")
    start_time = time.time()
    crawler = StockDataCrawler()
    crawler.run()
    end_time = time.time()
    logger.info(f"Total runtime: {end_time - start_time:.2f} s")
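To sanity-check the stored rows, a minimal query sketch (assuming the script above has already produced stocks.db in the working directory):

import sqlite3

# Count the saved rows per board in the database produced by the crawler above.
conn = sqlite3.connect("stocks.db")
cursor = conn.cursor()
cursor.execute("SELECT board_type, COUNT(*) FROM stock_data GROUP BY board_type")
for board, count in cursor.fetchall():
    print(f"{board}: {count} rows")
conn.close()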
Result screenshot:

image
Gitee link:
https://gitee.com/yan-tao2380465352/2025_crawl_project/blob/master/第四次实践作业_eastmoney_sqlite.py
(2) Takeaways: the ChromeDriver version must exactly match the major version of the local Chrome, otherwise the browser crashes immediately or fails silently; ad banners and overlay masks have to be clicked away inside a try/except, otherwise the target element sits in the DOM but is not interactable.
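For reference, a minimal sketch of that dismissal pattern, assuming a hypothetical .popup-close selector (the real overlay class is site-specific):

from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

def dismiss_overlay(driver, selector=".popup-close"):
    """Best-effort close of an ad/overlay mask; the selector is a placeholder."""
    try:
        driver.find_element(By.CSS_SELECTOR, selector).click()
    except NoSuchElementException:
        pass  # no overlay present, nothing to dismiss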
2. Use the Selenium framework with database storage to crawl course information from China University MOOC (icourse163.org). As in task 1, the implementation below stores to SQLite rather than MySQL.
(1) Code:
# mooc_course_crawler_fixed.py
import json
import time
import os
import csv
import sqlite3
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from bs4 import BeautifulSoup
import traceback


class MOOCCourseCrawlerFixed:
    """China University MOOC course crawler - fixed version"""

    def __init__(self, config=None):
        # default configuration
        self.config = {
            "cookie_file": "icourse_cookies.json",
            "chrome_driver_path": r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
            "output_dir": "mooc_data",
            "timeout": 15,           # shorter timeout
            "max_pages": 3,          # crawl a few pages first for testing
            "courses_per_page": 10,  # courses per page
            "headless": False,
            "retry_times": 3,        # retries
        }
        if config:
            self.config.update(config)
        self.driver = None
        self.wait = None
        # create the output directory
        os.makedirs(self.config["output_dir"], exist_ok=True)
        # initialize the database
        self.init_database()

    def init_database(self):
        """Initialize the SQLite database"""
        db_path = os.path.join(self.config["output_dir"], "mooc_courses.db")
        self.conn = sqlite3.connect(db_path, check_same_thread=False)
        self.cursor = self.conn.cursor()
        create_table_sql = """
            CREATE TABLE IF NOT EXISTS courses (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                course_id TEXT NOT NULL,
                course_name TEXT NOT NULL,
                university TEXT,
                teacher TEXT,
                team TEXT,
                participants INTEGER,
                schedule TEXT,
                description TEXT,
                url TEXT,
                category TEXT,
                crawl_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                UNIQUE(course_id)
            )
        """
        self.cursor.execute(create_table_sql)
        self.conn.commit()
        print(f"Database initialized: {db_path}")

    def restart_driver_if_needed(self):
        """Restart the driver if it has gone stale"""
        try:
            # quick probe to see whether the driver is still usable
            self.driver.current_url
            return True
        except Exception:
            print("Driver is stale, restarting...")
            self.close_driver()
            return self.setup_driver()

    def setup_driver(self):
        """Set up the browser driver"""
        try:
            # make sure the driver binary exists
            if not os.path.exists(self.config["chrome_driver_path"]):
                print(f"ChromeDriver not found: {self.config['chrome_driver_path']}")
                return False
            # Chrome options
            options = webdriver.ChromeOptions()
            if self.config["headless"]:
                options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            options.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
            options.add_argument('--log-level=3')
            options.add_argument('--disable-blink-features=AutomationControlled')
            options.add_argument('--disable-gpu')
            options.add_argument('--disable-infobars')
            options.add_argument('--start-maximized')
            # user agent
            options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
            # launch the browser
            service = Service(executable_path=self.config["chrome_driver_path"])
            self.driver = webdriver.Chrome(service=service, options=options)
            self.wait = WebDriverWait(self.driver, self.config["timeout"])
            # page load timeouts
            self.driver.set_page_load_timeout(30)
            self.driver.set_script_timeout(30)
            print("Browser started")
            return True
        except Exception as e:
            print(f"Failed to start the browser: {e}")
            return False

    def load_cookies_safe(self):
        """Load cookies defensively"""
        if not os.path.exists(self.config["cookie_file"]):
            print(f"Cookie file not found: {self.config['cookie_file']}")
            print("Continuing as a guest")
            return True  # continue without logging in
        try:
            with open(self.config["cookie_file"], "r", encoding="utf-8") as f:
                cookie_data = json.load(f)
            # visit the home page first
            self.driver.get("https://www.icourse163.org/")
            time.sleep(3)
            # clear all existing cookies
            self.driver.delete_all_cookies()
            # add the saved cookies
            cookies_added = 0
            for cookie in cookie_data.get("cookies", []):
                try:
                    # guard against format issues
                    cookie_to_add = cookie.copy()
                    # the required fields must be present
                    required_fields = ['name', 'value', 'domain']
                    if not all(field in cookie_to_add for field in required_fields):
                        continue
                    # expiry must be an integer
                    if 'expiry' in cookie_to_add:
                        cookie_to_add['expiry'] = int(cookie_to_add['expiry'])
                    # normalize the domain
                    if not cookie_to_add['domain'].startswith('.'):
                        cookie_to_add['domain'] = '.' + cookie_to_add['domain']
                    self.driver.add_cookie(cookie_to_add)
                    cookies_added += 1
                except Exception as e:
                    print(f"Failed to add cookie {cookie.get('name', 'unknown')}: {e}")
            print(f"Added {cookies_added} cookies")
            # refresh to apply the cookies
            self.driver.refresh()
            time.sleep(3)
            return True
        except Exception as e:
            print(f"Failed to load cookies: {e}")
            return True  # continue without logging in

    def get_safe(self, url, retry_times=3):
        """Visit a page with retries"""
        for attempt in range(retry_times):
            try:
                print(f"Visiting: {url} (attempt {attempt + 1}/{retry_times})")
                self.driver.get(url)
                time.sleep(3)
                return True
            except TimeoutException:
                print(f"Page load timed out, retry {attempt + 1}/{retry_times}")
                if attempt < retry_times - 1:
                    time.sleep(2)
                    continue
                else:
                    print("Page load failed")
                    return False
            except WebDriverException as e:
                print(f"Driver error: {e}")
                if "invalid session id" in str(e) or "disconnected" in str(e):
                    # the driver died; restart it
                    if not self.restart_driver_if_needed():
                        return False
                    time.sleep(2)
                    continue
                return False
        return False

    def parse_course_list_page_simple(self, url):
        """Parse a course list page with simple heuristics"""
        print(f"Parsing: {url}")
        courses = []
        try:
            # wait for the page to settle
            time.sleep(3)
            # grab the page source
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            # approach 1: look for course cards
            course_selectors = [
                '.course-card',  # common selector
                '.m-course-list .course-card',
                '.j-course-list .course-card',
                '.u-courseList .courseCard',
                '.g-hot-course .course-card',
                '[class*="courseCard"]',
                '[class*="course-card"]',
            ]
            course_cards = []
            for selector in course_selectors:
                course_cards = soup.select(selector)
                if course_cards:
                    print(f"Found {len(course_cards)} course cards (selector: {selector})")
                    break
            if not course_cards:
                # approach 2: collect every link that looks like a course
                all_course_links = []
                for a_tag in soup.find_all('a', href=True):
                    href = a_tag['href']
                    if '/course/' in href and 'icourse163.org' not in href:
                        # build course info from the link
                        course_info = {
                            'url': 'https://www.icourse163.org' + href if href.startswith('/') else href,
                            'title': a_tag.get_text(strip=True)
                        }
                        # look for a nearby university name
                        parent_div = a_tag.find_parent('div')
                        if parent_div:
                            school_elem = parent_div.find(class_=lambda x: x and ('school' in x.lower() or 'uni' in x.lower()))
                            if school_elem:
                                course_info['university'] = school_elem.get_text(strip=True)
                        all_course_links.append(course_info)
                print(f"Found {len(all_course_links)} course links")
                course_cards = all_course_links
            # extract information from each card
            for card in course_cards[:self.config["courses_per_page"]]:
                try:
                    if isinstance(card, dict):
                        # info extracted from a bare link
                        course_info = {
                            "course_id": "",
                            "course_name": card.get('title', '未知'),
                            "university": card.get('university', '未知'),
                            "teacher": "未知",
                            "team": "",
                            "participants": 0,
                            "schedule": "未知",
                            "description": "",
                            "url": card.get('url', '')
                        }
                        # derive the course id from the URL
                        if '/course/' in course_info['url']:
                            parts = course_info['url'].split('/course/')
                            if len(parts) > 1:
                                course_id = parts[1].split('/')[0].split('?')[0]
                                course_info['course_id'] = course_id
                    else:
                        # extract from an HTML element
                        course_info = self.extract_course_from_element(card)
                    if course_info and course_info.get('course_name') and course_info.get('url'):
                        courses.append(course_info)
                except Exception as e:
                    print(f"Failed to extract a course: {e}")
                    continue
            return courses
        except Exception as e:
            print(f"Failed to parse the page: {e}")
            traceback.print_exc()
            return []

    def extract_course_from_element(self, element):
        """Extract course info from an HTML element"""
        try:
            # course link
            link_elem = element.find('a', href=True)
            if not link_elem:
                return None
            href = link_elem['href']
            course_url = 'https://www.icourse163.org' + href if href.startswith('/') else href
            # course id
            course_id = ""
            if '/course/' in course_url:
                parts = course_url.split('/course/')
                if len(parts) > 1:
                    course_id = parts[1].split('/')[0].split('?')[0]
            # course name
            course_name = "未知"
            name_selectors = ['.course-name', '.title', '.f-thide', '.u-course-name', '.name', 'h3', 'h4']
            for selector in name_selectors:
                name_elem = element.select_one(selector)
                if name_elem:
                    course_name = name_elem.get_text(strip=True)
                    break
            # university
            university = "未知"
            uni_selectors = ['.school-name', '.university', '.u-course-uni', '.school', '.uni']
            for selector in uni_selectors:
                uni_elem = element.select_one(selector)
                if uni_elem:
                    university = uni_elem.get_text(strip=True)
                    break
            # teacher
            teacher = "未知"
            teacher_selectors = ['.teacher-name', '.teacher', '.u-course-teacher', '.lecturer']
            for selector in teacher_selectors:
                teacher_elem = element.select_one(selector)
                if teacher_elem:
                    teacher = teacher_elem.get_text(strip=True)
                    break
            # participant count
            participants = 0
            count_selectors = ['.hot', '.count', '.participants', '.u-course-count', '.enrollment']
            for selector in count_selectors:
                count_elem = element.select_one(selector)
                if count_elem:
                    count_text = count_elem.get_text(strip=True)
                    try:
                        if '万' in count_text:
                            participants = int(float(count_text.replace('万', '').replace('人', '').replace('+', '').strip()) * 10000)
                        elif 'k' in count_text.lower():
                            participants = int(float(count_text.lower().replace('k', '').strip()) * 1000)
                        else:
                            participants = int(''.join(filter(str.isdigit, count_text)))
                    except (ValueError, TypeError):
                        participants = 0
                    break
            # schedule
            schedule = "未知"
            schedule_selectors = ['.time', '.schedule', '.u-course-time', '.date', '.period']
            for selector in schedule_selectors:
                schedule_elem = element.select_one(selector)
                if schedule_elem:
                    schedule = schedule_elem.get_text(strip=True)
                    break
            # description
            description = ""
            desc_selectors = ['.brief', '.description', '.u-course-brief', '.intro']
            for selector in desc_selectors:
                desc_elem = element.select_one(selector)
                if desc_elem:
                    description = desc_elem.get_text(strip=True)[:200]
                    break
            return {
                "course_id": course_id,
                "course_name": course_name,
                "university": university,
                "teacher": teacher,
                "team": teacher,  # default the team to the teacher
                "participants": participants,
                "schedule": schedule,
                "description": description,
                "url": course_url
            }
        except Exception as e:
            print(f"Failed to extract course info: {e}")
            return None

    def crawl_single_page_safe(self, url, category="all"):
        """Crawl a single page with retries"""
        print(f"\nProcessing: {category}")
        for attempt in range(self.config["retry_times"]):
            try:
                # visit the page
                if not self.get_safe(url):
                    print(f"Failed to open the page, retry {attempt + 1}/{self.config['retry_times']}")
                    time.sleep(2)
                    continue
                # parse the courses
                courses = self.parse_course_list_page_simple(url)
                if courses:
                    print(f"Got {len(courses)} courses")
                    # save to the database
                    saved_count = 0
                    for course in courses:
                        course['category'] = category
                        if self.save_to_database(course):
                            saved_count += 1
                    print(f"Saved {saved_count} courses to the database")
                    return courses
                else:
                    print("No courses found, trying fallback...")
                    # scroll the page
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)
                    # parse again
                    courses = self.parse_course_list_page_simple(url)
                    if courses:
                        print(f"Found {len(courses)} courses after scrolling")
                        saved_count = 0
                        for course in courses:
                            course['category'] = category
                            if self.save_to_database(course):
                                saved_count += 1
                        print(f"Saved {saved_count} courses to the database")
                        return courses
                    return []
            except Exception as e:
                print(f"Crawl failed (attempt {attempt + 1}/{self.config['retry_times']}): {e}")
                traceback.print_exc()
                if attempt < self.config["retry_times"] - 1:
                    time.sleep(3)
                    # restart the driver
                    if not self.restart_driver_if_needed():
                        print("Could not restart the driver")
                        return []
                    continue
                else:
                    return []
        return []

    def crawl_courses_simple(self):
        """Simplified, more stable crawl entry point"""
        print("=" * 60)
        print("Crawling China University MOOC course info - simplified version")
        print("=" * 60)
        # start the browser
        if not self.setup_driver():
            return []
        try:
            # load cookies (best effort)
            self.load_cookies_safe()
            all_courses = []
            # URLs to crawl
            urls_to_crawl = [
                ("https://www.icourse163.org/", "首页推荐"),
                ("https://www.icourse163.org/category/computer", "计算机"),
                ("https://www.icourse163.org/search.htm?search=python", "Python"),
                ("https://www.icourse163.org/search.htm?search=数据分析", "数据分析"),
            ]
            for i, (url, category) in enumerate(urls_to_crawl[:self.config["max_pages"]]):
                print(f"\n{'=' * 40}")
                print(f"Progress: {i + 1}/{min(len(urls_to_crawl), self.config['max_pages'])}")
                courses = self.crawl_single_page_safe(url, category)
                all_courses.extend(courses)
                # throttle the requests
                if i < len(urls_to_crawl) - 1:
                    time.sleep(2)
            print(f"\nDone! Got {len(all_courses)} courses in total")
            # save the output files
            if all_courses:
                self.save_output_files(all_courses)
                self.print_statistics(all_courses)
            return all_courses
        except Exception as e:
            print(f"Error while crawling: {e}")
            traceback.print_exc()
            return []
        finally:
            self.close_driver()
            self.close_database()

    def save_to_database(self, course_data):
        """Save one course record to the database"""
        try:
            insert_sql = """
                INSERT OR REPLACE INTO courses (
                    course_id, course_name, university, teacher, team,
                    participants, schedule, description, url, category
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """
            self.cursor.execute(insert_sql, (
                course_data.get('course_id', ''),
                course_data.get('course_name', ''),
                course_data.get('university', ''),
                course_data.get('teacher', ''),
                course_data.get('team', ''),
                course_data.get('participants', 0),
                course_data.get('schedule', ''),
                course_data.get('description', ''),
                course_data.get('url', ''),
                course_data.get('category', '')
            ))
            self.conn.commit()
            return True
        except Exception as e:
            print(f"Failed to save to the database: {e}")
            self.conn.rollback()
            return False

    def save_output_files(self, courses):
        """Save the output files"""
        # CSV
        csv_file = os.path.join(self.config["output_dir"], "mooc_courses.csv")
        self.save_to_csv(courses, csv_file)
        # Excel
        excel_file = os.path.join(self.config["output_dir"], "mooc_courses.xlsx")
        self.save_to_excel(courses, excel_file)
        # plain-text summary
        text_file = os.path.join(self.config["output_dir"], "courses_summary.txt")
        self.save_summary(courses, text_file)

    def save_to_csv(self, courses, filename):
        """Save as CSV"""
        if not courses:
            return False
        try:
            headers = ["Id", "课程号", "课程名称", "学校名称", "主讲教师", "团队成员", "参加人数", "课程进度", "课程简介", "课程链接", "分类"]
            with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.writer(f)
                writer.writerow(headers)
                for i, course in enumerate(courses, 1):
                    writer.writerow([
                        i,
                        course.get('course_id', ''),
                        course.get('course_name', ''),
                        course.get('university', ''),
                        course.get('teacher', ''),
                        course.get('team', ''),
                        course.get('participants', 0),
                        course.get('schedule', ''),
                        course.get('description', ''),
                        course.get('url', ''),
                        course.get('category', '')
                    ])
            print(f"CSV saved: {filename}")
            return True
        except Exception as e:
            print(f"Failed to save CSV: {e}")
            return False

    def save_to_excel(self, courses, filename):
        """Save as Excel"""
        if not courses:
            return False
        try:
            import pandas as pd
            data = []
            for i, course in enumerate(courses, 1):
                data.append({
                    "Id": i,
                    "课程号": course.get('course_id', ''),
                    "课程名称": course.get('course_name', ''),
                    "学校名称": course.get('university', ''),
                    "主讲教师": course.get('teacher', ''),
                    "团队成员": course.get('team', ''),
                    "参加人数": course.get('participants', 0),
                    "课程进度": course.get('schedule', ''),
                    "课程简介": course.get('description', ''),
                    "课程链接": course.get('url', ''),
                    "分类": course.get('category', '')
                })
            df = pd.DataFrame(data)
            df.to_excel(filename, index=False)
            print(f"Excel saved: {filename}")
            return True
        except ImportError:
            print("pandas is required: pip install pandas")
            return False
        except Exception as e:
            print(f"Failed to save Excel: {e}")
            return False

    def save_summary(self, courses, filename):
        """Save a text summary"""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("China University MOOC course data summary\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total courses: {len(courses)}\n\n")
            for i, course in enumerate(courses, 1):
                f.write(f"{i}. {course.get('course_name')}\n")
                f.write(f"   University: {course.get('university')}\n")
                f.write(f"   Teacher: {course.get('teacher')}\n")
                f.write(f"   Participants: {course.get('participants')}\n")
                f.write(f"   Schedule: {course.get('schedule')}\n")
                f.write(f"   Category: {course.get('category')}\n")
                f.write(f"   Description: {course.get('description')[:100]}...\n")
                f.write(f"   URL: {course.get('url')}\n")
                f.write("-" * 60 + "\n")
        print(f"Summary saved: {filename}")

    def print_statistics(self, courses):
        """Print statistics"""
        print("\n" + "=" * 60)
        print("Statistics")
        print("=" * 60)
        if not courses:
            print("No data")
            return
        print(f"Total courses: {len(courses)}")
        # distribution by category
        categories = {}
        for course in courses:
            cat = course.get('category', '未知')
            categories[cat] = categories.get(cat, 0) + 1
        print("\nCategory distribution:")
        for cat, count in categories.items():
            print(f"  {cat}: {count} courses")
        # most popular courses
        if courses:
            sorted_by_popularity = sorted(courses, key=lambda x: x.get('participants', 0), reverse=True)[:5]
            print("\nTop courses (by participants):")
            for i, course in enumerate(sorted_by_popularity, 1):
                print(f"  {i}. {course.get('course_name')}")
                print(f"     University: {course.get('university')}")
                print(f"     Participants: {course.get('participants', 0):,}")
        print(f"\nData saved under the {self.config['output_dir']} directory")

    def close_driver(self):
        """Close the browser"""
        if self.driver:
            try:
                self.driver.quit()
                print("Browser closed")
            except Exception:
                pass

    def close_database(self):
        """Close the database"""
        if self.conn:
            try:
                self.conn.close()
                print("Database connection closed")
            except Exception:
                pass


def main():
    """Entry point"""
    print("=" * 60)
    print("China University MOOC course crawler - stable version")
    print("=" * 60)
    # configuration
    config = {
        "chrome_driver_path": r"C:\Program Files\Google\Chrome\Application\chromedriver.exe",
        "cookie_file": "icourse_cookies.json",
        "max_pages": 3,  # crawl only 3 pages for testing
        "headless": False,
    }
    crawler = MOOCCourseCrawlerFixed(config)
    print("\nCrawling course info...")
    courses = crawler.crawl_courses_simple()
    if courses:
        print("\n" + "=" * 60)
        print("Crawl complete!")
        print("=" * 60)
        # show the first few courses
        print("\nFirst 5 courses:")
        for i, course in enumerate(courses[:5], 1):
            print(f"\n{i}. {course.get('course_name')}")
            print(f"   University: {course.get('university')}")
            print(f"   Teacher: {course.get('teacher')}")
            print(f"   Participants: {course.get('participants'):,}")
            print(f"   Schedule: {course.get('schedule')}")
            print(f"   URL: {course.get('url')}")
    else:
        print("\nNo course data crawled")


if __name__ == "__main__":
    main()
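The crawler reads cookies from icourse_cookies.json but never creates that file. A minimal sketch of how it could be produced (log in by hand in the opened browser, then press Enter), wrapping the cookies in the {"cookies": [...]} shape that load_cookies_safe expects:

import json
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://www.icourse163.org/")
input("Log in manually in the browser window, then press Enter...")
# Dump the session cookies in the {"cookies": [...]} shape the crawler reads.
with open("icourse_cookies.json", "w", encoding="utf-8") as f:
    json.dump({"cookies": driver.get_cookies()}, f, ensure_ascii=False, indent=2)
driver.quit()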
Result screenshot:

image
Gitee link:
https://gitee.com/yan-tao2380465352/2025_crawl_project/blob/master/第四次实践作业_mooc_course_crawler.py
(2) Takeaways: I now understand the difference between explicit and implicit waits and how to set wait times sensibly to absorb network latency; I learned how to obtain, save, and reuse cookies to log in without re-entering a password; and I understand the web session mechanism and how to keep a login state alive.
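A minimal sketch contrasting the two wait styles (the .course-card selector is one of the candidates the crawler above tries; treat it as an assumption):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.implicitly_wait(5)  # implicit: a global polling budget for every find_element call
driver.get("https://www.icourse163.org/")
# Explicit: block up to 15 s for one specific condition, then fail loudly.
card = WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, ".course-card"))
)
print(card.text)
driver.quit()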
3.
Task 1: Generate test data with a Python script:

(image 17)
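A minimal sketch of what such a test-data generator could look like (the output path /tmp/test_data.log and the comma-separated record format are assumptions, not the exact script from the screenshot):

import random
import time

EVENTS = ["login", "logout", "purchase", "click"]

def generate(path="/tmp/test_data.log", n=100):
    # Append fake access-log records for Flume to tail later.
    with open(path, "a", encoding="utf-8") as f:
        for _ in range(n):
            f.write(f"{int(time.time())},{random.randint(1000, 9999)},{random.choice(EVENTS)}\n")

if __name__ == "__main__":
    generate()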
Task 2: Configure Kafka

(image 19)
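With the broker configured, a quick smoke test from Python (assuming the kafka-python package, a broker at localhost:9092, and a hypothetical topic test_topic):

from kafka import KafkaProducer, KafkaConsumer

# Produce one message...
producer = KafkaProducer(bootstrap_servers="localhost:9092")
producer.send("test_topic", b"hello from the generator")
producer.flush()

# ...then read it back to confirm broker and topic work end to end.
consumer = KafkaConsumer(
    "test_topic",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,  # give up after 5 s instead of blocking forever
)
for message in consumer:
    print(message.value)
    break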
Task 3: Install the Flume client

(image 21)
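For a stock Apache Flume 1.x client, installation is essentially unpacking the tarball and putting flume-ng on the PATH; the version and paths below are assumptions, and a vendor-packaged client will differ:

tar -xzf apache-flume-1.9.0-bin.tar.gz -C /opt
export FLUME_HOME=/opt/apache-flume-1.9.0-bin
export PATH=$PATH:$FLUME_HOME/bin
flume-ng version   # verify the client is callable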
Task 4: Configure Flume to collect data:

(image 23)
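A minimal Flume agent configuration of this shape would tail the generated log file into Kafka; the agent name, file path, topic, and broker address are all assumptions:

# flume-kafka.conf: exec source -> memory channel -> Kafka sink
a1.sources = r1
a1.channels = c1
a1.sinks = k1

a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /tmp/test_data.log
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000

a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.topic = test_topic
a1.sinks.k1.kafka.bootstrap.servers = localhost:9092
a1.sinks.k1.channel = c1

It would then be started with something like: flume-ng agent --conf-file flume-kafka.conf --name a1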
