mht 转换 html java,使用java将网页保存为mht格式(2)
//设置网页正文
MimeBodyPart bp = new MimeBodyPart();
bp.setText(content, strEncoding);
bp.addHeader("Content-Type", "text/html;charset=" + strEncoding);
bp.addHeader("Content-Location", strWeb.toString());
mp.addBodyPart(bp);
int urlCount = urlScriptList.size();
for (int i = 0; i < urlCount; i++) {
bp = new MimeBodyPart();
ArrayList urlInfo = (ArrayList) urlScriptList.get(i);
// String url = urlInfo.get(0).toString();
String absoluteURL = urlInfo.get(1).toString();
bp
.addHeader("Content-Location",
javax.mail.internet.MimeUtility
.encodeWord(java.net.URLDecoder
.decode(absoluteURL, strEncoding)));
DataSource source = new AttachmentDataSource(absoluteURL, "text");
bp.setDataHandler(new DataHandler(source));
mp.addBodyPart(bp);
}
urlCount = urlImageList.size();
for (int i = 0; i < urlCount; i++) {
bp = new MimeBodyPart();
ArrayList urlInfo = (ArrayList) urlImageList.get(i);
// String url = urlInfo.get(0).toString();
String absoluteURL = urlInfo.get(1).toString();
bp
.addHeader("Content-Location",
javax.mail.internet.MimeUtility
.encodeWord(java.net.URLDecoder
.decode(absoluteURL, strEncoding)));
DataSource source = new AttachmentDataSource(absoluteURL, "image");
bp.setDataHandler(new DataHandler(source));
mp.addBodyPart(bp);
}
msg.setContent(mp);
// write the mime multi part message to a file
msg.writeTo(new FileOutputStream(strFileName));
}
/**
 * Converts an MHT (MIME HTML) archive into an HTML file plus a resources directory.
 *
 * <p>The first body part is read as the HTML page. Every following part that has a
 * Content-Location header is saved into the directory {@code <strHtml>.files}; when the
 * save succeeds, occurrences of that URL in the HTML are replaced (through
 * {@code JHtmlClear.replace}) by the absolute path of the saved file. Nothing is written
 * when the message is not multipart, when the HTML text cannot be read, or when the
 * resources directory cannot be created. Exceptions are printed, not propagated.
 *
 * @param strMht  path of the MHT file to read
 * @param strHtml path of the HTML file to write
 */
public static void mht2html(String strMht, String strHtml) {
    InputStream fis = null;
    try {
        fis = new FileInputStream(strMht);
        Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
        MimeMessage msg = new MimeMessage(mailSession, fis);
        Object content = msg.getContent();
        if (!(content instanceof Multipart)) {
            return;
        }
        MimeMultipart mp = (MimeMultipart) content;
        MimeBodyPart htmlPart = (MimeBodyPart) mp.getBodyPart(0);
        String strEncoding = getEncoding(htmlPart);
        String strText = getHtmlText(htmlPart, strEncoding);
        if (strText == null) {
            return;
        }

        int partCount = mp.getCount();
        File parent = null;
        if (partCount > 1) {
            // Resources live next to the HTML file, in "<html file>.files".
            parent = new File(new File(strHtml).getAbsolutePath() + ".files");
            parent.mkdirs();
            if (!parent.exists()) {
                return;
            }
        }

        for (int i = 1; i < partCount; ++i) {
            MimeBodyPart bp = (MimeBodyPart) mp.getBodyPart(i);
            String strUrl = getResourcesUrl(bp);
            if (strUrl == null) {
                continue;
            }
            File resources = new File(parent.getAbsolutePath() + File.separator + getName(strUrl, i));
            // Only rewrite the link when the resource really was written to disk.
            if (saveResourcesFile(resources, bp.getInputStream())) {
                strText = JHtmlClear.replace(strText, strUrl, resources.getAbsolutePath());
            }
        }
        saveHtml(strText, strHtml);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // The source file handle must be released on every path, including early returns.
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
*方法说明:得到资源文件的name
*输入参数:strName 资源文件链接, ID 资源文件的序号
*返回类型:资源文件的本地临时文件名
*/
public static String getName(String strName, int ID) {
char separator = ’/’;
System.out.println(strName);
System.out.println(separator);
if( strName.lastIndexOf(separator) >= 0)
return format(strName.substring(strName.lastIndexOf(separator) + 1));
return "temp" + ID;
}
/**
 * Extracts the charset declared in the Content-Type header of a body part.
 *
 * <p>The header name and the {@code charset=} parameter name are matched
 * case-insensitively. Surrounding double quotes and any parameters following the
 * charset (after {@code ';'}) are removed. The value {@code gb2312} is reported as
 * {@code gbk} (presumably because GBK decodes a superset of GB2312 -- confirm).
 *
 * @param bp body part holding the page content; may be null
 * @return the charset name, or null when bp is null, no charset is declared, or the
 *         headers cannot be read
 */
private static String getEncoding(MimeBodyPart bp) {
    if (bp == null) {
        return null;
    }
    final String marker = "charset=";
    try {
        Enumeration list = bp.getAllHeaders();
        while (list.hasMoreElements()) {
            javax.mail.Header head = (javax.mail.Header) list.nextElement();
            if (!"Content-Type".equalsIgnoreCase(head.getName())) {
                continue;
            }
            String strType = head.getValue();
            if (strType == null) {
                continue;
            }
            // Case-insensitive search without lower-casing the whole value.
            int pos = -1;
            for (int i = 0; i + marker.length() <= strType.length(); i++) {
                if (strType.regionMatches(true, i, marker, 0, marker.length())) {
                    pos = i;
                    break;
                }
            }
            if (pos == -1) {
                continue;
            }
            String strEncoding = strType.substring(pos + marker.length());
            int end = strEncoding.indexOf(';');
            if (end != -1) {
                strEncoding = strEncoding.substring(0, end);
            }
            strEncoding = strEncoding.trim();
            if (strEncoding.length() >= 2 && strEncoding.startsWith("\"") && strEncoding.endsWith("\"")) {
                strEncoding = strEncoding.substring(1, strEncoding.length() - 1).trim();
            }
            if (strEncoding.length() == 0) {
                continue;
            }
            if (strEncoding.equalsIgnoreCase("gb2312")) {
                strEncoding = "gbk";
            }
            return strEncoding;
        }
    } catch (MessagingException e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Returns the URL of a resource part, taken from its Content-Location header.
 *
 * <p>The header name is matched case-insensitively, since MIME header field names are
 * not case-sensitive.
 *
 * @param bp body part holding a resource; may be null
 * @return value of the first Content-Location header, or null when bp is null, the
 *         header is absent, or the headers cannot be read
 */
private static String getResourcesUrl(MimeBodyPart bp) {
    if (bp == null) {
        return null;
    }
    try {
        Enumeration list = bp.getAllHeaders();
        while (list.hasMoreElements()) {
            javax.mail.Header head = (javax.mail.Header) list.nextElement();
            if ("Content-Location".equalsIgnoreCase(head.getName())) {
                return head.getValue();
            }
        }
    } catch (MessagingException e) {
        e.printStackTrace();
    }
    return null;
}
/**
*方法说明:格式化文件名
*输入参数:strName 文件名
*返回类型:经过处理的符合命名规则的文件名
*/
private static String format(String strName) {
if (strName == null)
return null;
strName = strName.replaceAll(" ", " ");
String strText = "/:*?"<>|^___FCKpd___0quot;;
for (int i = 0; i < strName.length(); ++i) {
String ch = String.valueOf(strName.charAt(i));
if (strText.indexOf(ch) != -1) {
strName = strName.replace(strName.charAt(i), ’-’);
}
}
return strName;
}
/**
 * Copies a stream into a file and closes the stream.
 *
 * <p>When the stream yields no data at all, the (empty) file is deleted again and the
 * method still returns true. Each stream is closed exactly once and independently, so a
 * failure while closing the output cannot leave the input open.
 *
 * @param resources   file to create; may be null
 * @param inputStream data to write; closed by this method; may be null
 * @return false when an argument is null, when copying fails, or when the output file
 *         cannot be closed; true otherwise
 */
private static boolean saveResourcesFile(File resources, InputStream inputStream) {
    if (resources == null || inputStream == null) {
        return false;
    }
    BufferedInputStream in = null;
    BufferedOutputStream out = null;
    boolean ok = false;
    boolean isEmpty = true;
    try {
        in = new BufferedInputStream(inputStream);
        out = new BufferedOutputStream(new FileOutputStream(resources));
        byte[] buffer = new byte[1024];
        int read;
        while ((read = in.read(buffer)) != -1) {
            isEmpty = false;
            out.write(buffer, 0, read);
        }
        out.flush();
        ok = true;
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("解析mht失败");
    } finally {
        if (out != null) {
            try {
                out.close();
            } catch (Exception e) {
                // Buffered data may not have reached the disk, so the file is not trustworthy.
                e.printStackTrace();
                System.out.println("解析mht失败");
                ok = false;
            }
        }
        try {
            // Closing the buffered wrapper also closes the caller's stream.
            if (in != null) {
                in.close();
            } else {
                inputStream.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("解析mht失败");
        }
    }
    if (ok && isEmpty) {
        resources.delete();
    }
    return ok;
}
/**
 * Reads the title of the HTML page stored in an MHT file.
 *
 * <p>The first body part of the multipart message is decoded as HTML and the text between
 * {@code <title>} and {@code </title>} (matched case-insensitively) is returned, trimmed
 * and in its original letter case.
 *
 * @param mhtFilename path of the MHT file
 * @return the page title, or null when the file is not multipart, the HTML cannot be
 *         read, no title element is found, or any error occurs
 */
public static String getTitle(String mhtFilename) {
    InputStream fis = null;
    try {
        fis = new FileInputStream(mhtFilename);
        Session mailSession = Session.getDefaultInstance(System.getProperties(), null);
        MimeMessage msg = new MimeMessage(mailSession, fis);
        Object content = msg.getContent();
        if (content instanceof Multipart) {
            MimeMultipart mp = (MimeMultipart) content;
            MimeBodyPart htmlPart = (MimeBodyPart) mp.getBodyPart(0);
            String strEncoding = getEncoding(htmlPart);
            String strText = getHtmlText(htmlPart, strEncoding);
            if (strText == null) {
                return null;
            }
            String lower = strText.toLowerCase();
            int pos1 = lower.indexOf("<title>");
            int pos2 = lower.indexOf("</title>");
            if (pos1 != -1 && pos2 != -1 && pos2 > pos1) {
                // Offsets found in the lower-cased copy are only valid in the original
                // when lower-casing did not change the length.
                String from = (lower.length() == strText.length()) ? strText : lower;
                return from.substring(pos1 + 7, pos2).trim();
            }
        }
        return null;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    } finally {
        if (fis != null) {
            try {
                fis.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Decodes the content of a body part into text.
 *
 * <p>The content is read line by line and every line is terminated with {@code "\r\n"} in
 * the result. When no encoding is given, the platform default charset is used instead of
 * failing.
 *
 * @param bp          body part holding the page content
 * @param strEncoding charset name used to decode the content; may be null
 * @return the decoded text, or null when reading or decoding fails
 */
private static String getHtmlText(MimeBodyPart bp, String strEncoding) {
    InputStream textStream = null;
    BufferedReader br = null;
    try {
        textStream = bp.getInputStream();
        BufferedInputStream buff = new BufferedInputStream(textStream);
        Reader r = (strEncoding == null)
                ? new InputStreamReader(buff)
                : new InputStreamReader(buff, strEncoding);
        br = new BufferedReader(r);
        StringBuffer strHtml = new StringBuffer();
        String strLine;
        while ((strLine = br.readLine()) != null) {
            strHtml.append(strLine).append("\r\n");
        }
        return strHtml.toString();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        try {
            // Closing the outermost reader closes the whole chain; fall back to the raw
            // stream when the reader was never created.
            if (br != null) {
                br.close();
            } else if (textStream != null) {
                textStream.close();
            }
        } catch (Exception e) {
            System.out.println("解析mht失败");
        }
    }
    return null;
}
/**
 * Writes HTML text to a file, replacing any existing content.
 *
 * <p>The text is encoded with the default charset of {@code FileWriter}. I/O errors are
 * printed, not propagated; the writer is closed even when writing fails.
 *
 * @param strText HTML content to write
 * @param strHtml path of the HTML file
 */
private static void saveHtml(String strText, String strHtml) {
    try {
        FileWriter fw = new FileWriter(strHtml);
        try {
            fw.write(strText);
        } finally {
            fw.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println("解析mht失败");
    }
}
/**
 * Splits a comma-separated string into e-mail addresses.
 *
 * @param emails addresses separated by {@code ','}
 * @return one InternetAddress per token, in input order
 * @throws Exception when a token cannot be turned into an InternetAddress
 */
private InternetAddress[] getInetAddresses(String emails) throws Exception {
    StringTokenizer tokenizer = new StringTokenizer(emails, ",");
    // countTokens() reports exactly how many tokens the loop below will consume.
    InternetAddress[] result = new InternetAddress[tokenizer.countTokens()];
    int next = 0;
    while (tokenizer.hasMoreTokens()) {
        result[next] = new InternetAddress(tokenizer.nextToken());
        next++;
    }
    return result;
}
/**
 * DataSource backed by bytes fetched from a URL when the object is constructed.
 *
 * <p>The bytes come from {@code JQuery.downBinaryFile} (presumably an HTTP download --
 * confirm against that helper) and are kept in memory; when nothing was fetched the
 * source behaves as empty.
 */
class AttachmentDataSource implements DataSource {
    private MimetypesFileTypeMap map = new MimetypesFileTypeMap();
    private String strUrl;
    private String strType;
    private byte[] dataSize = null;
    /**
     * Fixed content types for the resource kinds this class is created with.
     */
    private Map normalMap = new HashMap();
    {
        normalMap.put("image", "image/jpeg");
        normalMap.put("text", "text/plain");
    }

    /**
     * @param strUrl  absolute URL of the resource
     * @param strType resource kind, used as key into the fixed content-type map
     */
    public AttachmentDataSource(String strUrl, String strType) {
        this.strType = strType;
        this.strUrl = strUrl;
        // Only the request URL is normalised; the stored URL stays as given.
        String requestUrl = strUrl.trim().replaceAll(" ", "%20");
        dataSize = JQuery.downBinaryFile(requestUrl, null);
    }

    /**
     * Returns the content type.
     */
    public String getContentType() {
        return getMimeType(getName());
    }

    /**
     * Returns the part of the URL after its last {@code '/'}, or the whole URL when it
     * contains none. URLs always use {@code '/'}, whatever the platform file separator is.
     */
    public String getName() {
        int lastSlash = strUrl.lastIndexOf('/');
        if (lastSlash >= 0) {
            return strUrl.substring(lastSlash + 1);
        }
        return strUrl;
    }

    /**
     * Looks the resource kind up in the fixed map, then falls back to the file-name based
     * MIME map, and finally to {@code application/octet-stream}.
     */
    private String getMimeType(String fileName) {
        String type = (String) normalMap.get(strType);
        if (type == null) {
            try {
                type = map.getContentType(fileName);
            } catch (Exception ignored) {
                // Best effort: an unusable file name falls through to the generic type.
            }
            if (type == null) {
                type = "application/octet-stream";
            }
        }
        return type;
    }

    /**
     * Returns a fresh stream over the fetched bytes (empty when nothing was fetched).
     */
    public InputStream getInputStream() throws IOException {
        if (dataSize == null) {
            dataSize = new byte[0];
        }
        return new ByteArrayInputStream(dataSize);
    }

    /**
     * Returns a new in-memory stream that this object does not retain, so data written
     * to it does not change what getInputStream() serves.
     */
    public OutputStream getOutputStream() throws IOException {
        return new java.io.ByteArrayOutputStream();
    }
}
}
使用java将网页保存为mht格式(2).doc
下载Word文档到电脑,方便收藏和打印[全文共6479字]
编辑推荐:
下载Word文档
mht 转换 html java,使用java将网页保存为mht格式(2)相关推荐
- Java使用Spire将网页保存为PDF并去除Evaluation Warning水印方案
1.下载所需文件(45条消息) Java使用Spire将网页保存为PDF并去除EvaluationWarning水印方案-Java文档类资源-CSDN文库 2.加载到java项目 3.java例子,P ...
- 将网页保存为mht文件
(*// 标题:将网页保存为mht文件 说明:本地网页还未找到如何将图片打包,有知情者请发邮件告之 设计:Zswang 支持:wjhu111@21cn.com 日期:2004-01-05 //*) u ...
- 用 Microsoft.mshtml.dll 和 WebClient 自己实现网页保存为 MHT 文件
相信大家经常用IE保存网页功能保存有价值的网页,但是IE的网页保存功能做的不是太好,经常会有些页面保存失败.我也深受其烦,好在本人是程序员,程序员最大的好处是会自己编软件.正好我自己开发了个多页签浏览 ...
- 使用谷歌Chrome浏览器将网页保存为html格式
现版本谷歌Chrome浏览器再也没有"Save Page as MHTML"一说,将网页保存为MHT、MHTML、HTML格式有两种途径: 1.下载Save As MHT插件,压缩 ...
- 使用selenium将网页保存为MHTML格式
最近在做网页分析的时候偶然有需求将网页保存为MHTML格式,这样可以最大程度的保留网页的样式以及图片信息,搜索寻找整理过后得到如下代码 其中,pagelist.txt文件内容如下: 邮箱,qq邮箱,h ...
- Java使用Spire将网页保存为Word并去除Evaluation Warning水印方案
上代码 @SneakyThrowspublic static void spiceDoc(){Document document = new Document();Section sec = docu ...
- java mat转csv_MATLAB数据保存成其他格式文件(.csv .xlsx .txt等)通用函数[faruto版本]
function [Status, Message] = SaveData2File(Data, FileName, ColNamesCell) %% SaveData2File % by LiYan ...
- 如何将网页保存成mhtml格式
QQ浏览器默认开启. chrome和360极速浏览器(内核chrome) 首先打开浏览器地址栏输入chrome://flags ,会出现搜索框,搜索框里面输入mhtml可以查找Save Page a ...
- 用scrapy+selenium + phantomjs 爬取vip网页,保存为json格式,写入到mysql数据库,下载图片(二)
接上一编 weipin.py文件的代码 : # -*- coding: utf-8 -*- import scrapy from weipinhui.items import WeipinhuiIte ...
最新文章
- Science:一种新型玻璃,有望让手机告别碎屏!
- ubuntu 安装docker_Docker: 教程04 - (初始化安装之在 Ubuntu 安装Docker CE)
- java 添加一个线程、创建响应的用户界面 。 演示示例代码
- 【Python】修改pip默认缓存位置
- Math.signbit()
- 江苏师范大学计算机控制考试题库,江苏技术师范学院2006¬—2007学年第2学期《微机原理与接口技术》试卷(2份,有答案)...
- 第四季-专题16-触摸屏驱动程序设计
- linux系统操作mysql数据库_利用workbench对linux/Ubuntu系统中的mysql数据库进行操作
- AMEsim:车辆动力经济性建模分析三个分享点
- 华为荣耀4X的ROOT
- 沐风:立刻改变你的现状
- 使用tinyxml2库解析xml
- 云豹短视频app源码中,标签选择功能的实现
- 漫话:如何给女朋友解释为什么日本时间比中国时间快一个小时
- 到底什么是响应式布局,响应式布局如何去写
- 【LOJ573】「LibreOJ NOI Round #2」单枪匹马
- [转]Android入门基础教程
- c语言程序电机,直流电机控制C语言程序
- TREC的ad hoc、routing、filtering、topic等术语的解释
- VUE学习一:初识VUE、指令、动态绑定、计算属性