功能需求
将html字符串保存为.mhtml文件
代码实现
- pom.xml依赖
<dependencies>
    <dependency>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-web</artifactId>
    </dependency>

    <!-- https://mvnrepository.com/artifact/cn.hutool/hutool-all -->
    <dependency>
        <groupId>cn.hutool</groupId>
        <artifactId>hutool-all</artifactId>
        <version>5.8.43</version>
    </dependency>

    <!-- Jsoup: parses HTML tags and extracts image/stylesheet resources; required -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.17.2</version>
    </dependency>

    <!-- Apache utilities: Base64 encoding of image resources and IO stream handling; required -->
    <!-- Source: https://mvnrepository.com/artifact/commons-codec/commons-codec -->
    <dependency>
        <groupId>commons-codec</groupId>
        <artifactId>commons-codec</artifactId>
        <version>1.15</version>
        <scope>compile</scope>
    </dependency>

    <!-- Source: https://mvnrepository.com/artifact/commons-io/commons-io -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.15.1</version>
        <scope>compile</scope>
    </dependency>

    <!-- Source: https://mvnrepository.com/artifact/org.projectlombok/lombok -->
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.38</version>
        <scope>compile</scope>
    </dependency>
</dependencies>
- 获取通过访问url获取html字符串内容工具类
@Slf4j public class WikiUtils { /** * 获取wiki 页面html */ public static String getConfluencePageHtml(String url,String cookie) { String value = ""; HttpResponse httpResponse = HttpClient.httpGetResponse(url, cookie); if (httpResponse.isOk()){ value = httpResponse.body(); }else if (httpResponse.getStatus() == 403|| httpResponse.getStatus() == 302){ log.error("无效的cookie,无权限访问"); }else { log.error("获取html页面失败"); } return value; } /** * 在请求头中放入cookie,避免登录拦截 */ public static HttpResponse httpGetResponse(String url,String cookie) { Map<String, String> headers = new HashMap<>(); headers.put("Cookie", cookie); //登录 HttpResponse response = HttpRequest.get(url).headerMap(headers, true).execute(); return response; } }- Html转换.mhtml核心类
@Slf4j public class Html2MHTCompiler { public static String parseTittle(String html) { Document doc = Jsoup.parse(html); Element titleElement = doc.selectFirst("title"); if (titleElement != null) { String text = titleElement.text(); int i = text.indexOf("-"); if (i > 0) { return text.substring(0, i).trim(); } return text.trim(); } return null; } // 原资源URL -> 资源的Base64编码(带MIME头) public static Map<String, String> parseHtmlPage(String cookie,String html, String baseUrl) { Map<String, String> resourceMap = new HashMap<>(); Document doc = Jsoup.parse(html); // ========== 1. 提取所有 img 图片资源 ========== Elements imgElements = doc.select("img[src]"); for (Element imgElement : imgElements) { String imgSrc = imgElement.attr("src"); parseResource(cookie,imgSrc,"image",baseUrl, resourceMap); } // ========== 2. 提取所有 link 外链CSS样式表资源========== Elements cssElements = doc.select("link[rel=stylesheet][href]"); for (Element cssElement : cssElements) { String cssHref = cssElement.attr("href"); parseResource(cookie,cssHref, "CSS",baseUrl, resourceMap); } // ========== 3. 提取所有 script 外链JS脚本资源 ========== Elements jsElements = doc.select("script[src]"); for (Element jsElement : jsElements) { String jsSrc = jsElement.attr("src"); parseResource(cookie,jsSrc,"javascript",baseUrl, resourceMap); } return resourceMap; } // ========== 删除部分元素class="acs-side-bar ia-scrollable-section" 、 // class="ia-splitter-left"、 // id="header" // id="navigation" // id="likes-and-labels-container"、 // id="footer" 、 // id="comments-section" // id="page-metadata-banner" // id="breadcrumb-section" // 、id="main"的style="margin-left: 285px;" ========== public static String removeUnwantedElements(String html) { Document doc = Jsoup.parse(html); //删除head标签下的style标签的属性中的.ia-splitter-left #main 这两个选择器 removeCssSelectorFromStyleTag(doc, ".ia-splitter-left"); removeCssSelectorFromStyleTag(doc, "#main"); // 1. 
删除指定class的元素 → 侧边栏/左侧面板 等冗余区域 doc.select(".acs-side-bar .ia-scrollable-section").remove(); doc.select(".ia-splitter-left").remove(); // 2. 删除指定id的元素 → 点赞标签区、页脚、评论区 等无用模块 // doc.getElementById("likes-and-labels-container").remove(); doc.getElementById("footer").remove(); doc.getElementById("header").remove(); doc.getElementById("navigation").remove(); doc.getElementById("comments-section").remove(); doc.getElementById("page-metadata-banner").remove(); doc.getElementById("breadcrumb-section").remove(); // 3. 精准移除 id="main" 标签中【指定的style样式:margin-left: 285px;】,保留其他style样式 Element mainElement = doc.getElementById("main"); if (mainElement != null && mainElement.hasAttr("style")) { // 获取原style属性值 String oldStyle = mainElement.attr("style"); // 移除指定的样式段,保留其他样式 String newStyle = oldStyle.replace("margin-left: 285px;", "").trim(); // 处理移除后style为空的情况,避免残留空的style=""属性 if (newStyle.isEmpty()) { mainElement.removeAttr("style"); } else { mainElement.attr("style", newStyle); } } return doc.html(); } /** * 核心工具方法:删除<head>标签下所有<style>标签内的【指定CSS选择器】及其对应的所有样式 * @param doc jsoup解析后的文档对象 * @param selector 要删除的css选择器,如:.ia-splitter-left 、 #main */ private static void removeCssSelectorFromStyleTag(Document doc, String selector) { // 1. 获取head标签下所有的style样式标签 Elements styleTags = doc.head().select("style"); if (styleTags.isEmpty()) { return; // 没有style标签,直接返回 } // 2. 遍历每一个style标签,处理内部的css内容 for (Element styleTag : styleTags) { String cssContent = styleTag.html(); if (cssContent.isEmpty()) continue; // 3. 精准匹配【选择器 { 任意样式内容 }】 完整块,含换行/空格/制表符,匹配规则全覆盖 // 匹配规则:匹配 .ia-splitter-left { ... } 或 #main { ... } 完整的样式块 String regex = selector + "\\s*\\{[^}]*\\}"; // 替换匹配到的内容为空,即删除该选择器及对应样式 String newCssContent = cssContent.replaceAll(regex, "").trim(); // 处理替换后多余的空行/空格,让css内容更整洁 newCssContent = newCssContent.replaceAll("\\n+", "\n").replaceAll("\\s+", " "); // 4. 
将处理后的css内容重新写入style标签 styleTag.html(newCssContent); } } // ========== 图片/CSS/JS都复用这个方法 ========== private static void parseResource(String cookie,String resourceSrc,String resourceType,String baseUrl, Map<String, String> resourceMap) { try { // 拼接完整URL(兼容:绝对路径/相对路径) String fullResourceUrl = getFullUrl(baseUrl, resourceSrc); // 下载资源文件,转成【带MIME头的Base64编码】 String base64Resource = downloadResourceToBase64(fullResourceUrl,resourceType, cookie); resourceMap.put(resourceSrc, base64Resource); } catch (Exception e) { log.error("资源解析失败,跳过该资源:" + resourceSrc, e); } } // 拼接完整URL:处理相对路径/绝对路径 (原有方法,复用) private static String getFullUrl(String baseUrl, String src) { if (src.startsWith("http://") || src.startsWith("https://")) { return src; // 绝对路径,直接返回 } else if(src.startsWith("//")){ return "https:" + src; // 兼容 //xxx.com/xxx.css 这种无协议路径 } else { return src.startsWith("/") ? baseUrl + src : baseUrl + "/" + src; // 相对路径,拼接根路径 } } // ========== 通用资源下载+Base64编码方法,支持【图片/CSS/JS】所有类型 ========== private static String downloadResourceToBase64(String resourceUrl,String resourceType,String cookie) throws Exception { URL url = new URL(resourceUrl); HttpURLConnection conn = (HttpURLConnection) url.openConnection(); conn.setConnectTimeout(5000); conn.setReadTimeout(5000); conn.setRequestMethod("GET"); conn.setRequestProperty("Cookie",cookie); // 解决部分网站的反爬/跨域问题 conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0"); conn.setRequestProperty("Connection", "keep-alive"); conn.setRequestProperty("Accept", "*/*"); if (resourceType.equals("image")){ conn.setRequestProperty("Accept-Encoding", "gzip, deflate"); } if (conn.getResponseCode() == 200) { InputStream in = conn.getInputStream(); ByteArrayOutputStream out = new ByteArrayOutputStream(); byte[] buffer = new byte[1024]; int len; while ((len = in.read(buffer)) != -1) { out.write(buffer, 0, len); } byte[] resourceBytes = out.toByteArray(); // 对图片类型做【体积压缩+无损渲染】处理 if 
("image".equalsIgnoreCase(resourceType) && resourceBytes.length > 0) { resourceBytes = compressImage(resourceBytes, 0.7f); // 0.7是压缩质量,可调整 } // 获取资源的MIME类型 + Base64编码,自动适配图片/CSS/JS String mimeType = conn.getContentType(); String base64 = Base64.encodeBase64String(resourceBytes); in.close(); out.close(); conn.disconnect(); // 返回标准的data-url格式,可直接嵌入HTML替换原URL return "data:" + mimeType + ";base64," + base64; } return null; } /** * 核心图片压缩工具方法:图片质量压缩(核心无坑) * @param imageBytes 原图字节流 * @param quality 压缩质量 0.1~1.0 ,推荐0.6~0.8 (数值越大越清晰,体积越大) * @return 压缩后的图片字节流 */ private static byte[] compressImage(byte[] imageBytes, float quality) throws Exception { // 质量值兜底,防止传参错误 if (quality < 0.1f) quality = 0.1f; if (quality > 1.0f) quality = 1.0f; ByteArrayInputStream bais = new ByteArrayInputStream(imageBytes); BufferedImage bufferedImage = ImageIO.read(bais); if (bufferedImage == null) { return imageBytes; // 非标准图片,返回原图 } // 获取图片格式(png/jpg等) String format = getImageFormat(imageBytes); if (format == null) { format = "jpeg"; } ByteArrayOutputStream baos = new ByteArrayOutputStream(); // 质量压缩,尺寸不变,清晰度无损,体积减小 ImageIO.write(bufferedImage, format, new MemoryCacheImageOutputStream(baos) { @Override public void write(byte[] b, int off, int len) { try { super.write(b, off, len); } catch (Exception e) { // 异常时直接写入原图,不影响 } } }); // 如果压缩后体积变大,返回原图 byte[] compressedBytes = baos.toByteArray(); bais.close(); baos.close(); return compressedBytes.length < imageBytes.length ? 
compressedBytes : imageBytes; } /** * 获取图片真实格式 */ private static String getImageFormat(byte[] imageBytes) throws Exception { ByteArrayInputStream bais = new ByteArrayInputStream(imageBytes); ImageInputStream iis = ImageIO.createImageInputStream(bais); Iterator<ImageReader> readers = ImageIO.getImageReaders(iis); if (readers.hasNext()) { ImageReader reader = readers.next(); String format = reader.getFormatName(); iis.close(); bais.close(); return format; } iis.close(); bais.close(); return null; } public static String embedResources(String html, Map<String, String> resources) { String embeddedHtml = html; // 遍历所有资源,替换原URL为Base64编码 for (Map.Entry<String, String> entry : resources.entrySet()) { String resourceUrl = entry.getKey(); String resourceUrlEscape = resourceUrl.replace("&", "&"); String embeddedUrl = entry.getValue(); embeddedHtml = embeddedHtml.replace(resourceUrlEscape, embeddedUrl); } return embeddedHtml; } public static void saveAsMhtml(String html, String filePath) { try (BufferedWriter writer = new BufferedWriter( new OutputStreamWriter(new FileOutputStream(filePath), StandardCharsets.UTF_8) )) { // 写入MHTML标准协议头 writer.write("MIME-Version: 1.0"); writer.newLine(); writer.write("Content-Type: multipart/related; boundary=\"boundary\""); writer.newLine(); writer.newLine(); // 写入内容边界开始标识 writer.write("--boundary"); writer.newLine(); writer.write("Content-Type: text/html; charset=UTF-8"); writer.newLine(); writer.newLine(); // 写入核心的、已嵌入所有资源的HTML内容 writer.write(html); writer.newLine(); writer.newLine(); // 写入MHTML结束边界标识(必须写,否则文件格式不完整) writer.write("--boundary--"); writer.flush(); }catch (IOException e){ log.error("保存MHTML文件失败:" + filePath, e); } }逻辑调用:
- 通过url和cookie免密获取html字符串
- 获取html中的图片、CSS、JS资源并转成base64字符串,因为.mhtml文件中以外部链接方式引用的资源(如样式)无法渲染
- 删除html中不需要的布局和内容
- 使用第2步中得到的图片、CSS、JS的base64字符串,替换html字符串中对应的资源链接
- 保存为.mhtml文件
// 1. Fetch the page HTML, authenticating with the cookie.
String html = WikiUtils.getConfluencePageHtml(link, cookie);
if (html == null || html.isEmpty()) {
    log.error("获取html页面失败");
    return;
}

// 2. Download the images/CSS/JS referenced by the page, keyed by their original URL.
Map<String, String> htmlMap = Html2MHTCompiler.parseHtmlPage(cookie, html, properties.baseURL);

// parseTittle returns null when the page has no <title>, and a title may contain characters
// that are not allowed in file names, so derive a safe file name from it.
String tittle = Html2MHTCompiler.parseTittle(html);
String fileName = (tittle == null || tittle.trim().isEmpty())
        ? "untitled"
        : tittle.trim().replaceAll("[\\\\/:*?\"<>|]", "_");

// 3. Strip the unwanted layout elements.
String html2 = Html2MHTCompiler.removeUnwantedElements(html);

// 4. Replace the resource URLs with the embedded data from step 2.
String parseHtml = Html2MHTCompiler.embedResources(html2, htmlMap);

// 5. Save as an .mhtml file.
Html2MHTCompiler.saveAsMhtml(parseHtml, currentDir + File.separator + fileName + ".mhtml");