提取爱词霸页面中的自定义信息
/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/** | |
* 利用爱词霸筛选词表中属于各大考试的词 | |
* 提取爱词霸页面中的自定义信息 | |
* 考虑到爱词霸的防爬虫限制,特提供包含61821个单词的爱词霸HTML页面origin_html.zip文件供下载 | |
* 下载地址http://pan.baidu.com/s/1bnD9gy7 | |
* @author 杨尚川 | |
*/ | |
public class WordClassifier { | |
private WordClassifier(){} | |
private static final Logger LOGGER = LoggerFactory.getLogger(WordClassifier.class); | |
private static final String ICIBA = "http://www.iciba.com/"; | |
private static final String TYPE_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.dictbar div.wd_genre a"; | |
private static final String UNFOUND_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div#question.question.unfound_tips"; | |
private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; | |
private static final String ENCODING = "gzip, deflate"; | |
private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; | |
private static final String CONNECTION = "keep-alive"; | |
private static final String HOST = "www.iciba.com"; | |
private static final String REFERER = "http://www.iciba.com/"; | |
private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; | |
private static final Set<String> NOT_FOUND_WORDS = new HashSet<>(); | |
private static final Set<String> ORIGIN_HTML = new HashSet<>(); | |
public static void classify(Set<Word> words){ | |
LOGGER.debug("待处理词数目:"+words.size()); | |
AtomicInteger i = new AtomicInteger(); | |
Map<String, List<String>> data = new HashMap<>(); | |
words.forEach(word -> { | |
if(i.get()%1000 == 999){ | |
save(data); | |
} | |
showStatus(data, i.incrementAndGet(), words.size(), word.getWord()); | |
String html = getContent(word.getWord()); | |
//LOGGER.debug("获取到的HTML:" +html); | |
while(html.contains("非常抱歉,来自您ip的请求异常频繁")){ | |
//使用新的IP地址 | |
DynamicIp.toNewIp(); | |
html = getContent(word.getWord()); | |
} | |
if(StringUtils.isNotBlank(html)) { | |
parse(word.getWord(), html, data); | |
if(!NOT_FOUND_WORDS.contains(word.getWord())) { | |
ORIGIN_HTML.add(word.getWord() + "杨尚川" + html); | |
} | |
}else{ | |
NOT_FOUND_WORDS.add(word.getWord()); | |
} | |
}); | |
//写入磁盘 | |
save(data); | |
LOGGER.debug("处理完毕,总词数目:"+words.size()); | |
} | |
public static void parse(String path){ | |
if(path.endsWith(".zip")){ | |
parseZip(path); | |
} | |
if(Files.isDirectory(Paths.get(path))){ | |
parseDir(path); | |
}else{ | |
parseFile(path); | |
} | |
} | |
public static void parseDir(String dir) { | |
LOGGER.info("开始解析目录:" + dir); | |
try { | |
Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() { | |
@Override | |
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { | |
parseFile(file.toFile().getAbsolutePath()); | |
return FileVisitResult.CONTINUE; | |
} | |
}); | |
} catch (IOException e) { | |
LOGGER.error("解析文本出错", e); | |
} | |
} | |
public static void parseZip(String zipFile){ | |
LOGGER.info("开始解析ZIP文件:"+zipFile); | |
try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) { | |
for(Path path : fs.getRootDirectories()){ | |
LOGGER.info("处理目录:"+path); | |
Files.walkFileTree(path, new SimpleFileVisitor<Path>(){ | |
@Override | |
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { | |
LOGGER.info("处理文件:"+file); | |
// 拷贝到本地文件系统 | |
Path temp = Paths.get("target/origin-html-temp.txt"); | |
Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING); | |
parseFile(temp.toFile().getAbsolutePath()); | |
return FileVisitResult.CONTINUE; | |
} | |
}); | |
} | |
}catch (Exception e){ | |
LOGGER.error("解析文本出错", e); | |
} | |
} | |
public static void parseFile(String file){ | |
LOGGER.info("开始解析文件:"+file); | |
try (BufferedReader reader = new BufferedReader( | |
new InputStreamReader( | |
new BufferedInputStream( | |
new FileInputStream(file))))) { | |
Map<String, List<String>> data = new HashMap<>(); | |
String line = null; | |
while ((line = reader.readLine()) != null) { | |
parse(line, data); | |
} | |
save(data); | |
} catch (IOException e) { | |
LOGGER.error("解析文本出错", e); | |
} | |
} | |
public static void parse(String html, Map<String, List<String>> data){ | |
LOGGER.debug("html:"+html); | |
String[] attr = html.split("杨尚川"); | |
if(attr == null || attr.length != 2){ | |
LOGGER.error("解析文本失败,文本应该以'杨尚川'分割,前面是词,后面是网页,网页内容是去除换行符之后的一整行文本:"+html); | |
return; | |
} | |
String word = attr[0]; | |
LOGGER.info("解析单词:"+word); | |
String htm = attr[1]; | |
parse(word, htm, data); | |
} | |
public static void showStatus(Map<String, List<String>> data, int current, int total, String word){ | |
LOGGER.debug("开始处理词 "+current+"/"+total+" ,完成进度 "+current/(float)total*100+"% :"+word); | |
data.entrySet().forEach(e -> { | |
LOGGER.debug(e.getKey()+"\t"+e.getValue().size()); | |
}); | |
} | |
public static void save(Map<String, List<String>> data){ | |
LOGGER.info("将数据写入磁盘,防止丢失"); | |
data.keySet().forEach(key -> { | |
try { | |
String path = "src/main/resources/word_" + key + ".txt"; | |
LOGGER.error("保存词典文件:" + path); | |
List<String> existWords = Files.readAllLines(Paths.get(path)); | |
Set<String> allWords = new HashSet<>(); | |
existWords.forEach(line -> { | |
String[] attr = line.split("\\s+"); | |
if(attr != null) { | |
String w = ""; | |
if(attr.length == 1){ | |
w = attr[0]; | |
} | |
if(attr.length == 2){ | |
w = attr[1]; | |
} | |
allWords.add(w); | |
} | |
}); | |
allWords.addAll(data.get(key)); | |
AtomicInteger i = new AtomicInteger(); | |
List<String> list = allWords | |
.stream() | |
.sorted() | |
.map(word -> i.incrementAndGet()+"\t" + word) | |
.collect(Collectors.toList()); | |
Files.write(Paths.get(path), list); | |
data.get(key).clear(); | |
existWords.clear(); | |
allWords.clear(); | |
list.clear(); | |
}catch (Exception e){ | |
LOGGER.error("保存词典文件失败", e); | |
} | |
}); | |
data.clear(); | |
try { | |
if(!NOT_FOUND_WORDS.isEmpty()) { | |
String path = "src/main/resources/word_not_found.txt"; | |
LOGGER.error("保存词典文件:" + path); | |
AtomicInteger i = new AtomicInteger(); | |
//NOT_FOUND_WORDS比较少,常驻内存 | |
List<String> list = NOT_FOUND_WORDS | |
.stream() | |
.sorted() | |
.map(word -> i.incrementAndGet() + "\t" + word) | |
.collect(Collectors.toList()); | |
Files.write(Paths.get(path), list); | |
list.clear(); | |
} | |
//保存原始HTML | |
if(!ORIGIN_HTML.isEmpty()) { | |
String path = "src/main/resources/origin_html_" + System.currentTimeMillis() + ".txt"; | |
LOGGER.error("保存词典文件:" + path); | |
Files.write(Paths.get(path), ORIGIN_HTML); | |
ORIGIN_HTML.clear(); | |
} | |
}catch (Exception e){ | |
LOGGER.error("保存词典文件失败", e); | |
} | |
} | |
public static String getContent(String word) { | |
String url = ICIBA + word + "?renovate=" + (new Random(System.currentTimeMillis()).nextInt(899999)+100000); | |
LOGGER.debug("url:"+url); | |
Connection conn = Jsoup.connect(url) | |
.header("Accept", ACCEPT) | |
.header("Accept-Encoding", ENCODING) | |
.header("Accept-Language", LANGUAGE) | |
.header("Connection", CONNECTION) | |
.header("Referer", REFERER) | |
.header("Host", HOST) | |
.header("User-Agent", USER_AGENT) | |
.ignoreContentType(true); | |
String html = ""; | |
try { | |
html = conn.post().html(); | |
html = html.replaceAll("[\n\r]", ""); | |
}catch (Exception e){ | |
LOGGER.error("获取URL:"+url+"页面出错", e); | |
} | |
return html; | |
} | |
public static void parse(String word, String html, Map<String, List<String>> data){ | |
Document doc = Jsoup.parse(html); | |
Elements es = doc.select(TYPE_CSS_PATH); | |
for(Element e : es){ | |
String type = e.text(); | |
LOGGER.debug("获取到的类型:"+type); | |
if(StringUtils.isNotBlank(type)){ | |
data.putIfAbsent(type, new ArrayList<>()); | |
data.get(type).add(word); | |
} | |
} | |
es = doc.select(UNFOUND_CSS_PATH); | |
for(Element e : es){ | |
String notFound = e.text(); | |
LOGGER.debug("没有该词:"+notFound); | |
if(StringUtils.isNotBlank(notFound) | |
&& (notFound.contains("对不起,没有找到") | |
|| notFound.contains("您要查找的是不是"))){ | |
NOT_FOUND_WORDS.add(word); | |
} | |
} | |
} | |
public static void main(String[] args) { | |
//Set<Word> words = new HashSet<>(); | |
//words.add(new Word("time", "")); | |
//words.add(new Word("yangshangchuan", "")); | |
//classify(words); | |
//classify(WordSources.getAll()); | |
//parse("src/main/resources/origin_html_1427060576977.txt"); | |
//origin_html.zip包含61821个单词的爱词霸解析HTML页面,下载地址http://pan.baidu.com/s/1bnD9gy7 | |
parse("/Users/apple/百度云同步盘/origin_html.zip"); | |
} | |
} |
提取爱词霸页面中的自定义信息相关推荐
- 对爱词霸(iciba)生词本功能的一些建议
工作中经常要用到爱词霸iciba进行生词查询,同时为了让日趋降低的单词量降得慢一些,我使用了其中的生词本功能.然而,有几点缺陷让我很不爽. 没有词义编辑功能 没有增加编辑例句 最好有一个生词编组的功能 ...
- Python也可以很暖男之每日发送爱词霸每日一句+日期+农历+天气预报+各种天气指数(更新)
Python也可以很暖男之每日发送爱词霸每日一句中英文+日期+农历+天气预报+各种天气指数(只学了一个多月新手的第二个实战项目,代码写的不好,请轻喷,谢谢,有很多网友问为何用不了,因为中国天气网改了代 ...
- 每天叫醒你的不是闹钟,而是“爱词霸每日一句”——Python实现将每日一句定时推送至微信...
前言 前几天在网上看到一篇文章<教你用微信每天给女票说晚安>,感觉很神奇的样子,随后研究了一下,构思的确是巧妙.好,那就开始动工吧!服务器有了,Python环境有了,IDE打开了...然而 ...
- 金山爱词霸系列软件ISO纯净合集
金山爱词霸系列软件ISO合集(集成金山词霸2007.金山快译2007.WPS·爱词霸版) 本ISO合集含: 金山词霸2007.金山快译2007.WPS Office爱词霸版.金山英文写作助理 所有 ...
- linux 词霸,Linux中的词霸
Linux中的词霸 Linux中的词霸 何晓龙2004年5月31日 第21期 星际译王是Linux中遵循GPL的英汉.汉英字典程序,它具有"规则/模糊查询"."屏幕取词& ...
- 最新爱词霸 Java + mysql (含源码+数据库)
爱词霸 最新爬取 Java + mysql (含源码+数据库) 感觉金山词库的内容相对来说是最完整的!研究了一天,通过爬取html整出来的! 当然只是教程,切勿真实爬取.具体实现为什么这么做,不做阐述 ...
- python编写--爱词霸在线翻译软件
原理: 在使用金山PDF的时候发现,里面的取词翻译是使用的iCIBA(爱词霸),而且翻译的非常准确.就尝试了一下在线翻译.网址:http://fy.iciba.com/?from=wps_client ...
- 【SharePoint】SharePoint自定义页面中使用自定义母版页
前言 SharePoint自定义页面中使用自定义母版页 详细步骤 1.新建自定义母版页:把母版页seattle.master复制一份,修改为自己所需的名字,如portalmmcustom.master ...
- java错误页面显示错误信息_Struts2在JSP页面中显示错误信息和提示信息的方法
Struts2在JSP页面中显示错误信息和提示信息的方法主要有以下四种. 注意:以下四种方法均需要使Action类继承ActionSupport类. 一.域级错误信息 ①重写Action中的valid ...
- embed的名词_embed是什么意思_embed的翻译_音标_读音_用法_例句_爱词霸在线词典
全部 把-嵌入 At the bottom of this structure we embed constants into operators which terminate the nestin ...
最新文章
- could not export python function call python_value. Remove calls to Python functions before export
- 液冷数据中心再获加持,北京数据中心PUE要低至1.118
- 嵌入式系统开发入门一:必备基础知识
- 调整了canvas的高度页面变化后还原_Python GUI编程入门(25)-移动Canvas对象
- Qt Creator寻找
- eclipse连接hdfs操作设置用户名
- 逆向工程核心原理学习笔记(七):总结
- [TJOI2010]阅读理解
- java中的jgroup_JGroups实现聊天小程序
- mysql show的用法
- 网易游戏开发工程师笔试题
- Unity3d发布WebPlayer版本遇到的问题的解决方法
- idea编译android项目,IDEA创建Android项目并反编译APK
- wpf OpenClipBoard CLIPBRD_E_CANT_OPEN
- 计算机查看图片的打开方式,windows10电脑怎么在右键菜单打开方式添加照片查看器...
- Quartus II cyclone 系列fpga程序下载到flash中
- 因为没有使用有效的安全证书进行签名,该内容已被屏蔽。
- JAVA_抽象类和接口
- 回归常态啦 2020.12.08日记
- 安卓手机安装运行谷歌三件套、googleplay收集表调查报告
热门文章
- C++异常机制的实现方式和开销分析
- admin后台管理系统
- 【自然语言处理】【聚类】ECIC:通过迭代分类增强短文本聚类
- 飞鱼星测试软件,飞鱼星VE984GW+
- win7家庭普通版升级旗舰版 密钥
- 统计局:2018年全国规模以上工业企业利润增长10.3%
- SU战队专访:破而后立,晓喻新生
- 2021牛客暑期多校训练营10 F.Train Wreck(栈,并查集,优先队列,贪心)
- 在VMware虚拟机环境下为msdos7.1安装masm
- Gyro陀螺仪 > MPU 6000 vs ICM 20689