通过爬虫爬取的数据保存为JSON格式,然后将JSON数据清洗,得到想要的数据
package com.example.jingd;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.regex.Pattern;
import java.nio.file.Paths;public class CommentCleanerSimple {public static void main(String[] args) {String inputFilePath = "src/main/resources/jd_comments_raw_20251026_135759.json";String outputFilePath = "target/cleaned_comments.csv";ObjectMapper objectMapper = new ObjectMapper();try {// 确保输出目录存在Files.createDirectories(Paths.get("target"));// 创建输入输出流BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));BufferedWriter writer = new BufferedWriter(new FileWriter(outputFilePath));// 写入CSV标题行writer.write("用户名,评论ID,评分,评论日期,评论内容,用户等级,购买型号,购买次数,购买信息,产品规格\n");String line;while ((line = reader.readLine()) != null) {try {// 解析JSON数据JsonNode jsonNode = objectMapper.readTree(line);// 提取需要的字段String userNickName = jsonNode.has("userNickName") ? jsonNode.get("userNickName").asText() : "";String commentId = jsonNode.has("commentId") ? jsonNode.get("commentId").asText() : "";String commentScore = jsonNode.has("commentScore") ? jsonNode.get("commentScore").asText() : "";String commentDate = jsonNode.has("commentDate") ? jsonNode.get("commentDate").asText() : "";// 处理评论内容:移除特殊字符、换行符,将英文逗号替换为中文逗号String commentData = "";if (jsonNode.has("commentData")) {// 简单直接的字符串处理方式commentData = jsonNode.get("commentData").asText().replace('\n', ' ') // 替换换行符为空格.replace('\r', ' ') // 替换回车符为空格.replace(',', ',') // 将英文逗号替换为中文逗号.replaceAll("[\\\\/\\*\\(\\)\\[\\]\\{\\}\\^\\$\\|\\?\\+\\.\\<\\>\\!\\@\\#\\%\\^\\&\\*]", "") // 移除常见特殊字符.trim(); // 去除首尾空白}// 提取用户等级String userLevel = jsonNode.has("officerLevel") ? jsonNode.get("officerLevel").asText() : "";// 提取购买型号(从wareAttribute数组中获取)String productModel = "";if (jsonNode.has("wareAttribute") && jsonNode.get("wareAttribute").isArray()) {for (JsonNode attr : jsonNode.get("wareAttribute")) {if (attr.has("型号")) {productModel = attr.get("型号").asText();break;}}}// 提取购买次数String buyCount = jsonNode.has("buyCount") ? jsonNode.get("buyCount").asText() : "";// 提取购买信息文本String buyCountText = jsonNode.has("buyCountText") ? 
jsonNode.get("buyCountText").asText().replaceAll(",", ",") : "";// 提取产品规格信息String productSpecifications = jsonNode.has("productSpecifications") ? jsonNode.get("productSpecifications").asText().replaceAll(",", ",") : "";// 构建CSV行String csvLine = String.join(",", userNickName, commentId, commentScore, commentDate, commentData,userLevel, productModel, buyCount, buyCountText, productSpecifications);writer.write(csvLine + "\n");} catch (Exception e) {System.err.println("解析JSON失败: " + e.getMessage());e.printStackTrace();}}reader.close();writer.close();System.out.println("数据清洗完成!结果保存在: " + outputFilePath);} catch (IOException e) {System.err.println("文件操作失败: " + e.getMessage());e.printStackTrace();}}
}
然后得到.csv数据文件
