大家好,我是烤鸭:
某音竟然有 PC 版了,不过搜索返回的数据有限,会限制条数,亲测最多只能搜到 400 条数据,下面简单分析一下抓取过程。
工具使用
java + chromedriver + fiddler
Java + Selenium 自动化操作网页;搜索需要登录,可以先手动登录一次,之后通过指定同一个 Chrome 用户数据目录(user-data-dir)共享 cookie。
@Test
public void testXyin() {String keyWord = "旅游";try {// 调用chrome driverSystem.setProperty("webdriver.chrome.driver", "D:\\dev\\env\\chromedriver\\chromedriver.exe");// 共享cookie// ChromeOptionsChromeOptions chromeOptions = new ChromeOptions();// 添加用户cookieschromeOptions.addArguments("--user-data-dir=C:\\Users\\user\\AppData\\Local\\Google\\Chrome\\User Data-Cookie");WebDriver driver = new ChromeDriver(chromeOptions);// 窗口最大化driver.manage().window().maximize();driver.get("https://www.douyin.com/search/"+ keyWord+ "?publish_time=0&sort_type=0&source=normal_search&type=general");// 调整高度((ChromeDriver) driver).executeScript("window.scrollTo(0, document.body.scrollHeight);");Thread.sleep(1000);// 构建driver对象driver.manage().timeouts().implicitlyWait(3, TimeUnit.SECONDS);WebElement webElement = driver.findElement(By.cssSelector("body"));webElement.click(); // 有的时候必须点击一下,下拉才能生效(有的网站是这样,原因未找到)} catch (Exception e) {e.printStackTrace();}
}
fiddler 脚本
修改 Fiddler,Rules—>Customize Rules, 改写 OnBeforeResponse 方法
static function OnBeforeResponse(oSession: Session) {if (m_Hide304s && oSession.responseCode == 304) {oSession["ui-hide"] = "true";}//加在方法末尾if (oSession.HostnameIs("www.douyin.com") && oSession.uriContains("https://www.douyin.com/aweme/v1/web/general/search/single")){var filename = "D:\\data\\dy\\fiddler-token.log";var curDate = new Date();var logContent = "[" + curDate.toLocaleString() + "] " + oSession.GetRequestBodyAsString() + "\r\n"+oSession.GetResponseBodyAsString()+"\r\n";var sw : System.IO.StreamWriter;if (System.IO.File.Exists(filename)){sw = System.IO.File.AppendText(filename);sw.Write(logContent);}else{sw = System.IO.File.CreateText(filename);sw.Write(logContent);}sw.Close();sw.Dispose();}}
解析数据
读取文件解析:
public void readText() {ReaderTxt rt = new ReaderTxt();ArrayList<String> list = rt.InitTxt();for (int i = 0; i < list.size(); i++) {String txt = list.get(i);if (!txt.startsWith("{")) {continue;}JSONObject jrs = JSONObject.parseObject(txt);JSONArray array = jrs.getJSONArray("data");for (Object obs : array) {DyScrapVideo scrapVideo = new DyScrapVideo();JSONObject json = (JSONObject) obs;// aweme_infoJSONObject awemeInfo = json.getJSONObject("aweme_info");if (!Optional.ofNullable(awemeInfo).isPresent()) {continue;}// https://www.douyin.com/video/ + aweme_id 详情页String aweme_id = awemeInfo.getString("aweme_id");String desc = awemeInfo.getString("desc");Long publishTime = awemeInfo.getLong("create_time");scrapVideo.setVideoDesc(desc);scrapVideo.setAwemeId(aweme_id);scrapVideo.setVideoPublishTime(UnixUtil.TimeStamp2Date(publishTime + ""));// authorJSONObject author = awemeInfo.getJSONObject("author");Long aLong = author.getLong("uid");String nickname = author.getString("nickname");String signature = author.getString("signature");scrapVideo.setAuthorUid(aLong + "");scrapVideo.setAuthorNickname(nickname);scrapVideo.setAuthorSignature(signature);JSONObject avatar_thumb = author.getJSONObject("avatar_thumb");JSONArray url_list = avatar_thumb.getJSONArray("url_list");if (Optional.ofNullable(url_list).isPresent()) {scrapVideo.setAuthorAvatarThumb(url_list.get(0).toString());}Long follower_count = author.getLong("follower_count");scrapVideo.setFollowerCount(follower_count != null ? 
follower_count.intValue() : 0);String custom_verify = author.getString("custom_verify");scrapVideo.setCustomVerify(custom_verify);// videoJSONObject video = awemeInfo.getJSONObject("video");if(video != null){JSONObject download_addr = video.getJSONObject("download_addr");if(download_addr != null){JSONArray down_url_list = download_addr.getJSONArray("url_list");if (Optional.ofNullable(down_url_list).isPresent()) {scrapVideo.setVideoDownloadAddr(UnicodeUtil.unicodeToCN(down_url_list.get(0).toString()));}}Integer duration = video.getInteger("duration");scrapVideo.setVideoDuration(duration);}// statisticsJSONObject statistics = awemeInfo.getJSONObject("statistics");if(statistics != null){Integer comment_count = statistics.getInteger("comment_count");Integer digg_count = statistics.getInteger("digg_count");Integer download_count = statistics.getInteger("download_count");Integer play_count = statistics.getInteger("play_count");Integer share_count = statistics.getInteger("share_count");Integer collect_count = statistics.getInteger("collect_count");scrapVideo.setCommentCount(comment_count);scrapVideo.setDiggCount(digg_count);scrapVideo.setDownloadCount(download_count);scrapVideo.setPlayCount(play_count);scrapVideo.setShareCount(share_count);scrapVideo.setCollectCount(collect_count);}scrapVideo.setCreateDate(new Date());scrapVideo.setSearchKeyword("北京旅游");}}
}public ArrayList<String> InitTxt() {ArrayList<String> list = new ArrayList<String>();try { // 防止文件建立或读取失败,用catch捕捉错误并打印,也可以throw/* 读入TXT文件 */String pathname ="D:\\data\\fiddler-token.log"; // 绝对路径或相对路径都可以,这里是绝对路径,写入文件时演示相对路径File filename = new File(pathname);InputStreamReader reader =new InputStreamReader(new FileInputStream(filename), "utf-8"); // 建立一个输入流对象readerBufferedReader br = new BufferedReader(reader); // 建立一个对象,它把文件内容转成计算机能读懂的语言String line = "";while (line != null) {line = br.readLine(); // 一次读入一行数据if (line == null) {break;}list.add(line);}} catch (Exception e) {e.printStackTrace();}return list;
}
实体对象:
package com.machu.picchu.crawler.dto;import java.util.Date;public class DyScrapVideo {private Integer id;private String awemeId;private String videoDesc;private Date videoPublishTime;private String videoDownloadAddr;private Integer videoDuration;private Integer commentCount;private Integer diggCount;private Integer playCount;private Integer downloadCount;private Integer shareCount;private Integer collectCount;private String authorUid;private String authorNickname;private String authorSignature;private String authorAvatarThumb;private Integer followerCount;private String customVerify;private Date createDate;private Date publishDate;private String searchKeyword;private String memo;private Integer status;public Integer getId() {return id;}public void setId(Integer id) {this.id = id;}public String getVideoDesc() {return videoDesc;}public void setVideoDesc(String videoDesc) {this.videoDesc = videoDesc;}public Date getVideoPublishTime() {return videoPublishTime;}public void setVideoPublishTime(Date videoPublishTime) {this.videoPublishTime = videoPublishTime;}public String getVideoDownloadAddr() {return videoDownloadAddr;}public void setVideoDownloadAddr(String videoDownloadAddr) {this.videoDownloadAddr = videoDownloadAddr;}public Integer getVideoDuration() {return videoDuration;}public void setVideoDuration(Integer videoDuration) {this.videoDuration = videoDuration;}public Integer getCommentCount() {return commentCount;}public void setCommentCount(Integer commentCount) {this.commentCount = commentCount;}public Integer getDiggCount() {return diggCount;}public void setDiggCount(Integer diggCount) {this.diggCount = diggCount;}public Integer getPlayCount() {return playCount;}public void setPlayCount(Integer playCount) {this.playCount = playCount;}public Integer getDownloadCount() {return downloadCount;}public void setDownloadCount(Integer downloadCount) {this.downloadCount = downloadCount;}public Integer getShareCount() {return shareCount;}public void setShareCount(Integer 
shareCount) {this.shareCount = shareCount;}public Integer getCollectCount() {return collectCount;}public void setCollectCount(Integer collectCount) {this.collectCount = collectCount;}public String getAuthorUid() {return authorUid;}public void setAuthorUid(String authorUid) {this.authorUid = authorUid;}public String getAuthorNickname() {return authorNickname;}public void setAuthorNickname(String authorNickname) {this.authorNickname = authorNickname;}public String getAuthorSignature() {return authorSignature;}public void setAuthorSignature(String authorSignature) {this.authorSignature = authorSignature;}public String getAuthorAvatarThumb() {return authorAvatarThumb;}public void setAuthorAvatarThumb(String authorAvatarThumb) {this.authorAvatarThumb = authorAvatarThumb;}public Integer getFollowerCount() {return followerCount;}public void setFollowerCount(Integer followerCount) {this.followerCount = followerCount;}public String getCustomVerify() {return customVerify;}public void setCustomVerify(String customVerify) {this.customVerify = customVerify;}public Date getCreateDate() {return createDate;}public void setCreateDate(Date createDate) {this.createDate = createDate;}public Date getPublishDate() {return publishDate;}public void setPublishDate(Date publishDate) {this.publishDate = publishDate;}public String getSearchKeyword() {return searchKeyword;}public void setSearchKeyword(String searchKeyword) {this.searchKeyword = searchKeyword;}public String getMemo() {return memo;}public void setMemo(String memo) {this.memo = memo;}public Integer getStatus() {return status;}public void setStatus(Integer status) {this.status = status;}public String getAwemeId() {return awemeId;}public void setAwemeId(String awemeId) {this.awemeId = awemeId;}
}