提取爱词霸页面中的自定义信息
/**
 *
 * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
 * yang-shangchuan@qq.com
 *
 * This program is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */
package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/** | |
* 利用爱词霸筛选词表中属于各大考试的词 | |
* 提取爱词霸页面中的自定义信息 | |
* 考虑到爱词霸的防爬虫限制,特提供包含61821个单词的爱词霸HTML页面origin_html.zip文件供下载 | |
* 下载地址http://pan.baidu.com/s/1bnD9gy7 | |
* @author 杨尚川 | |
*/ | |
public class WordClassifier { | |
private WordClassifier(){} | |
private static final Logger LOGGER = LoggerFactory.getLogger(WordClassifier.class); | |
private static final String ICIBA = "http://www.iciba.com/"; | |
private static final String TYPE_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.dictbar div.wd_genre a"; | |
private static final String UNFOUND_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div#question.question.unfound_tips"; | |
private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; | |
private static final String ENCODING = "gzip, deflate"; | |
private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; | |
private static final String CONNECTION = "keep-alive"; | |
private static final String HOST = "www.iciba.com"; | |
private static final String REFERER = "http://www.iciba.com/"; | |
private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; | |
private static final Set<String> NOT_FOUND_WORDS = new HashSet<>(); | |
private static final Set<String> ORIGIN_HTML = new HashSet<>(); | |
public static void classify(Set<Word> words){ | |
LOGGER.debug("待处理词数目:"+words.size()); | |
AtomicInteger i = new AtomicInteger(); | |
Map<String, List<String>> data = new HashMap<>(); | |
words.forEach(word -> { | |
if(i.get()%1000 == 999){ | |
save(data); | |
} | |
showStatus(data, i.incrementAndGet(), words.size(), word.getWord()); | |
String html = getContent(word.getWord()); | |
//LOGGER.debug("获取到的HTML:" +html); | |
while(html.contains("非常抱歉,来自您ip的请求异常频繁")){ | |
//使用新的IP地址 | |
DynamicIp.toNewIp(); | |
html = getContent(word.getWord()); | |
} | |
if(StringUtils.isNotBlank(html)) { | |
parse(word.getWord(), html, data); | |
if(!NOT_FOUND_WORDS.contains(word.getWord())) { | |
ORIGIN_HTML.add(word.getWord() + "杨尚川" + html); | |
} | |
}else{ | |
NOT_FOUND_WORDS.add(word.getWord()); | |
} | |
}); | |
//写入磁盘 | |
save(data); | |
LOGGER.debug("处理完毕,总词数目:"+words.size()); | |
} | |
public static void parse(String path){ | |
if(path.endsWith(".zip")){ | |
parseZip(path); | |
} | |
if(Files.isDirectory(Paths.get(path))){ | |
parseDir(path); | |
}else{ | |
parseFile(path); | |
} | |
} | |
public static void parseDir(String dir) { | |
LOGGER.info("开始解析目录:" + dir); | |
try { | |
Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() { | |
@Override | |
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { | |
parseFile(file.toFile().getAbsolutePath()); | |
return FileVisitResult.CONTINUE; | |
} | |
}); | |
} catch (IOException e) { | |
LOGGER.error("解析文本出错", e); | |
} | |
} | |
public static void parseZip(String zipFile){ | |
LOGGER.info("开始解析ZIP文件:"+zipFile); | |
try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) { | |
for(Path path : fs.getRootDirectories()){ | |
LOGGER.info("处理目录:"+path); | |
Files.walkFileTree(path, new SimpleFileVisitor<Path>(){ | |
@Override | |
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { | |
LOGGER.info("处理文件:"+file); | |
// 拷贝到本地文件系统 | |
Path temp = Paths.get("target/origin-html-temp.txt"); | |
Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING); | |
parseFile(temp.toFile().getAbsolutePath()); | |
return FileVisitResult.CONTINUE; | |
} | |
}); | |
} | |
}catch (Exception e){ | |
LOGGER.error("解析文本出错", e); | |
} | |
} | |
public static void parseFile(String file){ | |
LOGGER.info("开始解析文件:"+file); | |
try (BufferedReader reader = new BufferedReader( | |
new InputStreamReader( | |
new BufferedInputStream( | |
new FileInputStream(file))))) { | |
Map<String, List<String>> data = new HashMap<>(); | |
String line = null; | |
while ((line = reader.readLine()) != null) { | |
parse(line, data); | |
} | |
save(data); | |
} catch (IOException e) { | |
LOGGER.error("解析文本出错", e); | |
} | |
} | |
public static void parse(String html, Map<String, List<String>> data){ | |
LOGGER.debug("html:"+html); | |
String[] attr = html.split("杨尚川"); | |
if(attr == null || attr.length != 2){ | |
LOGGER.error("解析文本失败,文本应该以'杨尚川'分割,前面是词,后面是网页,网页内容是去除换行符之后的一整行文本:"+html); | |
return; | |
} | |
String word = attr[0]; | |
LOGGER.info("解析单词:"+word); | |
String htm = attr[1]; | |
parse(word, htm, data); | |
} | |
public static void showStatus(Map<String, List<String>> data, int current, int total, String word){ | |
LOGGER.debug("开始处理词 "+current+"/"+total+" ,完成进度 "+current/(float)total*100+"% :"+word); | |
data.entrySet().forEach(e -> { | |
LOGGER.debug(e.getKey()+"\t"+e.getValue().size()); | |
}); | |
} | |
public static void save(Map<String, List<String>> data){ | |
LOGGER.info("将数据写入磁盘,防止丢失"); | |
data.keySet().forEach(key -> { | |
try { | |
String path = "src/main/resources/word_" + key + ".txt"; | |
LOGGER.error("保存词典文件:" + path); | |
List<String> existWords = Files.readAllLines(Paths.get(path)); | |
Set<String> allWords = new HashSet<>(); | |
existWords.forEach(line -> { | |
String[] attr = line.split("\\s+"); | |
if(attr != null) { | |
String w = ""; | |
if(attr.length == 1){ | |
w = attr[0]; | |
} | |
if(attr.length == 2){ | |
w = attr[1]; | |
} | |
allWords.add(w); | |
} | |
}); | |
allWords.addAll(data.get(key)); | |
AtomicInteger i = new AtomicInteger(); | |
List<String> list = allWords | |
.stream() | |
.sorted() | |
.map(word -> i.incrementAndGet()+"\t" + word) | |
.collect(Collectors.toList()); | |
Files.write(Paths.get(path), list); | |
data.get(key).clear(); | |
existWords.clear(); | |
allWords.clear(); | |
list.clear(); | |
}catch (Exception e){ | |
LOGGER.error("保存词典文件失败", e); | |
} | |
}); | |
data.clear(); | |
try { | |
if(!NOT_FOUND_WORDS.isEmpty()) { | |
String path = "src/main/resources/word_not_found.txt"; | |
LOGGER.error("保存词典文件:" + path); | |
AtomicInteger i = new AtomicInteger(); | |
//NOT_FOUND_WORDS比较少,常驻内存 | |
List<String> list = NOT_FOUND_WORDS | |
.stream() | |
.sorted() | |
.map(word -> i.incrementAndGet() + "\t" + word) | |
.collect(Collectors.toList()); | |
Files.write(Paths.get(path), list); | |
list.clear(); | |
} | |
//保存原始HTML | |
if(!ORIGIN_HTML.isEmpty()) { | |
String path = "src/main/resources/origin_html_" + System.currentTimeMillis() + ".txt"; | |
LOGGER.error("保存词典文件:" + path); | |
Files.write(Paths.get(path), ORIGIN_HTML); | |
ORIGIN_HTML.clear(); | |
} | |
}catch (Exception e){ | |
LOGGER.error("保存词典文件失败", e); | |
} | |
} | |
public static String getContent(String word) { | |
String url = ICIBA + word + "?renovate=" + (new Random(System.currentTimeMillis()).nextInt(899999)+100000); | |
LOGGER.debug("url:"+url); | |
Connection conn = Jsoup.connect(url) | |
.header("Accept", ACCEPT) | |
.header("Accept-Encoding", ENCODING) | |
.header("Accept-Language", LANGUAGE) | |
.header("Connection", CONNECTION) | |
.header("Referer", REFERER) | |
.header("Host", HOST) | |
.header("User-Agent", USER_AGENT) | |
.ignoreContentType(true); | |
String html = ""; | |
try { | |
html = conn.post().html(); | |
html = html.replaceAll("[\n\r]", ""); | |
}catch (Exception e){ | |
LOGGER.error("获取URL:"+url+"页面出错", e); | |
} | |
return html; | |
} | |
public static void parse(String word, String html, Map<String, List<String>> data){ | |
Document doc = Jsoup.parse(html); | |
Elements es = doc.select(TYPE_CSS_PATH); | |
for(Element e : es){ | |
String type = e.text(); | |
LOGGER.debug("获取到的类型:"+type); | |
if(StringUtils.isNotBlank(type)){ | |
data.putIfAbsent(type, new ArrayList<>()); | |
data.get(type).add(word); | |
} | |
} | |
es = doc.select(UNFOUND_CSS_PATH); | |
for(Element e : es){ | |
String notFound = e.text(); | |
LOGGER.debug("没有该词:"+notFound); | |
if(StringUtils.isNotBlank(notFound) | |
&& (notFound.contains("对不起,没有找到") | |
|| notFound.contains("您要查找的是不是"))){ | |
NOT_FOUND_WORDS.add(word); | |
} | |
} | |
} | |
public static void main(String[] args) { | |
//Set<Word> words = new HashSet<>(); | |
//words.add(new Word("time", "")); | |
//words.add(new Word("yangshangchuan", "")); | |
//classify(words); | |
//classify(WordSources.getAll()); | |
//parse("src/main/resources/origin_html_1427060576977.txt"); | |
//origin_html.zip包含61821个单词的爱词霸解析HTML页面,下载地址http://pan.baidu.com/s/1bnD9gy7 | |
parse("/Users/apple/百度云同步盘/origin_html.zip"); | |
} | |
} |
提取爱词霸页面中的自定义信息相关推荐
- 对爱词霸(iciba)生词本功能的一些建议
工作中经常要用到爱词霸iciba进行生词查询,同时为了让日趋降低的单词量降得慢一些,我使用了其中的生词本功能.然而,有几点缺陷让我很不爽. 没有词义编辑功能 没有增加编辑例句 最好有一个生词编组的功能 ...
- Python也可以很暖男之每日发送爱词霸每日一句+日期+农历+天气预报+各种天气指数(更新)
Python也可以很暖男之每日发送爱词霸每日一句中英文+日期+农历+天气预报+各种天气指数(只学了一个多月新手的第二个实战项目,代码写的不好,请轻喷,谢谢,有很多网友问为何用不了,因为中国天气网改了代 ...
- 每天叫醒你的不是闹钟,而是“爱词霸每日一句”——Python实现将每日一句定时推送至微信...
前言 前几天在网上看到一篇文章<教你用微信每天给女票说晚安>,感觉很神奇的样子,随后研究了一下,构思的确是巧妙.好,那就开始动工吧!服务器有了,Python环境有了,IDE打开了...然而 ...
- 金山爱词霸系列软件ISO纯净合集
金山爱词霸系列软件ISO合集(集成金山词霸2007.金山快译2007.WPS·爱词霸版) 本ISO合集含: 金山词霸2007.金山快译2007.WPS Office爱词霸版.金山英文写作助理 所有 ...
- linux 词霸,Linux中的词霸
Linux中的词霸 Linux中的词霸 何晓龙2004年5月31日 第21期 星际译王是Linux中遵循GPL的英汉.汉英字典程序,它具有"规则/模糊查询"."屏幕取词& ...
- 最新爱词霸 Java + mysql (含源码+数据库)
爱词霸 最新爬取 Java + mysql (含源码+数据库) 感觉金山词库的内容相对来说是最完整的!研究了一天,通过爬取html整出来的! 当然只是教程,切勿真实爬取.具体实现为什么这么做,不做阐述 ...
- python编写--爱词霸在线翻译软件
原理: 在使用金山PDF的时候发现,里面的取词翻译是使用的iCIBA(爱词霸),而且翻译的非常准确.就尝试了一下在线翻译.网址:http://fy.iciba.com/?from=wps_client ...
- 【SharePoint】SharePoint自定义页面中使用自定义母版页
前言 SharePoint自定义页面中使用自定义母版页 详细步骤 1.新建自定义母版页:把母版页seattle.master复制一份,修改为自己所需的名字,如portalmmcustom.master ...
- java错误页面显示错误信息_Struts2在JSP页面中显示错误信息和提示信息的方法
Struts2在JSP页面中显示错误信息和提示信息的方法主要有以下四种. 注意:以下四种方法均需要使Action类继承ActionSupport类. 一.域级错误信息 ①重写Action中的valid ...
- embed的名词_embed是什么意思_embed的翻译_音标_读音_用法_例句_爱词霸在线词典
全部 把-嵌入 At the bottom of this structure we embed constants into operators which terminate the nestin ...
最新文章
- could not export python function call python_value. Remove calls to Python functions before export
- 液冷数据中心再获加持,北京数据中心PUE要低至1.118
- 嵌入式系统开发入门一:必备基础知识
- 调整了canvas的高度页面变化后还原_Python GUI编程入门(25)-移动Canvas对象
- Qt Creator寻找
- eclipse连接hdfs操作设置用户名
- 逆向工程核心原理学习笔记(七):总结
- [TJOI2010]阅读理解
- java中的jgroup_JGroups实现聊天小程序
- mysql show的用法
- 网易游戏开发工程师笔试题
- Unity3d发布WebPlayer版本遇到的问题的解决方法
- idea编译android项目,IDEA创建Android项目并反编译APK
- wpf OpenClipBoard CLIPBRD_E_CANT_OPEN
- 计算机查看图片的打开方式,windows10电脑怎么在右键菜单打开方式添加照片查看器...
- Quartus II cyclone 系列fpga程序下载到flash中
- 因为没有使用有效的安全证书进行签名,该内容已被屏蔽。
- JAVA_抽象类和接口
- 回归常态啦 2020.12.08日记
- 安卓手机安装运行谷歌三件套、googleplay收集表调查报告
热门文章
- C++异常机制的实现方式和开销分析
- admin后台管理系统
- 【自然语言处理】【聚类】ECIC:通过迭代分类增强短文本聚类
- 飞鱼星测试软件,飞鱼星VE984GW+
- win7家庭普通版升级旗舰版 密钥
- 统计局:2018年全国规模以上工业企业利润增长10.3%
- SU战队专访:破而后立,晓喻新生
- 2021牛客暑期多校训练营10 F.Train Wreck(栈,并查集,优先队列,贪心)
- 在VMware虚拟机环境下为msdos7.1安装masm
- Gyro陀螺仪 > MPU 6000 vs ICM 20689