Contents
1. What Is Ajax
2. How to Analyze Ajax
3. Ajax Analysis and Scraping in Practice
  3.1 Scraping Target
  3.2 Initial Exploration
  3.3 Scraping the List Pages: analysis; implementation (basic configuration, fetching a page's JSON content, scraping a given list page, merged code)
  3.4 Scraping the Detail Pages
  3.5 Saving the Data to MongoDB (to be added later)
 
1. What Is Ajax

What requests fetches is the raw HTML document, while the page shown in the browser is the result of JavaScript processing data. That data can come from several sources: loaded via Ajax, embedded in the HTML document itself, or computed by JavaScript with some specific algorithm.

Ajax loads data asynchronously: the original page does not contain certain data at first; once the page has loaded, it sends a request to an API on the server, and the returned data is then processed and rendered into the page.

Ajax (Asynchronous JavaScript and XML) is not a programming language. It is a technique that uses JavaScript to exchange data with a server and update parts of a page without reloading it or changing its URL. A typical example is "scroll down for more": the loading animation that appears as you scroll is Ajax at work. A basic XMLHttpRequest example:

var xmlhttp;
if (window.XMLHttpRequest) {
    xmlhttp = new XMLHttpRequest();
} else {
    // fallback for old versions of IE
    xmlhttp = new ActiveXObject("Microsoft.XMLHTTP");
}
// fired whenever readyState changes; readyState 4 plus HTTP 200 means
// the response has fully arrived and can be rendered into the page
xmlhttp.onreadystatechange = function() {
    if (xmlhttp.readyState == 4 && xmlhttp.status == 200) {
        document.getElementById("myDiv").innerHTML = xmlhttp.responseText;
    }
};
xmlhttp.open("POST", "/ajax/", true);
xmlhttp.send();
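The crawler's point of attack is exactly this hidden request: find it in the browser's developer tools and reproduce it directly. As a minimal sketch (the host below is a hypothetical placeholder standing in for the relative "/ajax/" URL above, since the snippet names no server):

import requests

# Hypothetical endpoint; a real crawler copies the full request URL
# (and any required headers) from the browser's Network panel.
url = "https://example.com/ajax/"
response = requests.post(url)
print(response.status_code)
print(response.text)  # the fragment JavaScript would insert into #myDiv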
2. How to Analyze Ajax

On Weibo (weibo.cn), Ajax requests appear with type xhr. If a request's Request Headers contain X-Requested-With: XMLHttpRequest, that request is an Ajax request. Click the XHR filter in the Network panel to show only Ajax requests; the Preview tab shows the parsed response content, and the Response tab shows the raw data actually returned.

3. Ajax Analysis and Scraping in Practice

3.1 Scraping Target

The target is Scrape | Movie. Unlike the site in section 2.5 (also called Scrape | Movie), here the data is requested via Ajax and the page content is rendered by JavaScript; only the visual presentation is the same. The goals:

scrape each movie's name, cover, categories, release date, score, and plot summary;
analyze how the page loads its data;
implement the Ajax scraping with requests;
save each movie's data to a MongoDB database.

3.2 Initial Exploration

import requests

url = "https://spa1.scrape.center/"
html = requests.get(url).text
print(html)
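Before going further, it is worth confirming that the movie data really is absent from the raw HTML. A quick check (a sketch: "霸王别姬" is assumed to be one of the titles visible in the rendered page; any string you can see in the browser works):

import requests

html = requests.get("https://spa1.scrape.center/").text
# True would mean the data is server-rendered; here we expect False,
# because the titles only appear after JavaScript runs its Ajax calls.
print("霸王别姬" in html)
print(len(html))  # the raw document is short, mostly <script> references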
The HTML we get back is quite sparse: the whole page is rendered by JavaScript. The browser executes the JavaScript files referenced in the HTML, and only after that JavaScript calls its data-loading and page-rendering routines does the page look the way it does in the browser. The data itself is generally loaded via Ajax, with JavaScript calling the data API in the background.

In the request URL, limit is always 10, and offset is the number of movies already paged past, i.e. offset = (page − 1) × 10. Inspecting the response shows that all the data we need is in it.

3.3 Scraping the List Pages

Basic configuration:

import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s')

INDEX_URL = "https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}"
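For example, page 2 has paged past the 10 movies of page 1, so offset = (2 − 1) × 10 = 10:

# INDEX_URL as defined above
page = 2
url = INDEX_URL.format(limit=10, offset=10 * (page - 1))
print(url)  # https://spa1.scrape.center/api/movie/?limit=10&offset=10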
Fetching a page's JSON content:

import requests

def scrape_api(url):
    logging.info(f"scraping {url}...")
    try:
        response = requests.get(url)
        if response.status_code == 200:
            # the API returns JSON, so parse it directly
            return response.json()
        logging.error(f"Status code: {response.status_code} while scraping {url}")
    except requests.RequestException:
        logging.error(f"Error while scraping {url}", exc_info=True)
Scraping a given list page, deriving offset from the page number:

LIMIT = 10

def scrape_index(page):
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)
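What one list page looks like in practice (a sketch using the functions above; "results" and "id" are the fields the code in this section relies on, while "name" is only an assumption for illustration):

data = scrape_index(1)
for item in data.get("results"):
    # each item is one movie's summary record from the Ajax API
    print(item.get("id"), item.get("name"))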
Merged, the list-page code so far:

import logging
import requests

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s')

INDEX_URL = "https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}"
LIMIT = 10

def scrape_api(url):
    logging.info(f"scraping {url}...")
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        logging.error(f"Status code: {response.status_code} while scraping {url}")
    except requests.RequestException:
        logging.error(f"Error while scraping {url}", exc_info=True)

def scrape_index(page):
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)
3.4 Scraping the Detail Pages

The last path segment of a detail URL is the movie's id, and that id is included in the data the list-page Ajax request returns:

DETAIL_URL = "https://spa1.scrape.center/api/movie/{id}"

def scrape_detail(id):
    url = DETAIL_URL.format(id=id)
    return scrape_api(url)
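Chaining the two stages together (a sketch assuming the functions above are defined; "name" is again an assumed field for illustration):

index_data = scrape_index(1)
first_id = index_data.get("results")[0].get("id")
detail_data = scrape_detail(first_id)
print(detail_data.get("name"))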
Iterating over every list page, then over every movie on it:

TOTAL_PAGE = 10

def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        for item in index_data.get("results"):
            id = item.get("id")
            detail_data = scrape_detail(id)
            logging.info(f"detail data {detail_data}")

if __name__ == "__main__":
    main()
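One caveat about main as written: scrape_api returns None when a request fails, so index_data.get("results") would raise AttributeError on a bad page. A slightly more defensive variant (a sketch, not from the source; the merged listing below keeps the original form):

def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        if not index_data:
            # scrape_api returned None: log it and move on instead of crashing
            logging.warning(f"no data for page {page}, skipping")
            continue
        for item in index_data.get("results", []):
            movie_id = item.get("id")
            detail_data = scrape_detail(movie_id)
            logging.info(f"detail data {detail_data}")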
Merged, the complete program:

import logging
import requests

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s: %(message)s')

INDEX_URL = "https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}"
DETAIL_URL = "https://spa1.scrape.center/api/movie/{id}"
LIMIT = 10
TOTAL_PAGE = 10

def scrape_api(url):
    logging.info(f"scraping {url}...")
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        logging.error(f"Status code: {response.status_code} while scraping {url}")
    except requests.RequestException:
        logging.error(f"Error while scraping {url}", exc_info=True)

def scrape_index(page):
    url = INDEX_URL.format(limit=LIMIT, offset=LIMIT * (page - 1))
    return scrape_api(url)

def scrape_detail(id):
    url = DETAIL_URL.format(id=id)
    return scrape_api(url)

def main():
    for page in range(1, TOTAL_PAGE + 1):
        index_data = scrape_index(page)
        for item in index_data.get("results"):
            id = item.get("id")
            detail_data = scrape_detail(id)
            logging.info(f"detail data {detail_data}")

if __name__ == "__main__":
    main()
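Section 3.5 (saving the data) is deferred in the original, but the stated goal is to store each movie in MongoDB. A minimal sketch with pymongo; the connection string, database name, and collection name are all assumptions, and the upsert keys on the movie's name:

import pymongo

# Assumed setup: a local MongoDB instance, database "movies", collection "movies"
MONGO_CONNECTION_STRING = "mongodb://localhost:27017"
client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
collection = client["movies"]["movies"]

def save_data(data):
    # upsert so re-running the crawler updates records instead of duplicating them
    collection.update_one({"name": data.get("name")},
                          {"$set": data},
                          upsert=True)

main would then call save_data(detail_data) right after each detail page is scraped.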