java解析搜狗词库scel文件到txt
SougouScelReader 读取词库文件类
import java.io.*; import java.net.URL; import java.util.ArrayList; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; /*** 读取搜索词库** @author dengjh* @create 2016-11-03 9:39**/ public class SougouScelReader {public SougouScelMdel read(File file) throws IOException {return read(new FileInputStream(file));}public SougouScelMdel read(URL url) throws IOException {return read(url.openStream());}protected ByteArrayOutputStream output=new ByteArrayOutputStream();protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {int read=reads[0];input.skip(pos-read);read=pos;output.reset();while(true) {int c1 = input.read();int c2 = input.read();read+=2;if(c1==0 && c2==0) {break;} else {output.write(c1);output.write(c2);}}reads[0]=read;return new String(output.toByteArray(),encoding);}protected static String encoding = "UTF-16LE";public SougouScelMdel read(InputStream in) throws IOException {SougouScelMdel model = new SougouScelMdel();DataInputStream input = new DataInputStream(in);int read;try {byte[] bytes = new byte[4];input.readFully(bytes);assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);input.readFully(bytes);int flag1 = bytes[0];assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);int[] reads=new int[]{8};model.setName(readString(input,0x130,reads));model.setType(readString(input,0x338,reads));model.setDescription(readString(input,0x540,reads));model.setSample(readString(input,0xd40,reads));read = reads[0];input.skip(0x1540 - read);read=0x1540;input.readFully(bytes);read += 4;assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);bytes = new byte[128];Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();while (true) {int mark = readUnsignedShort(input);int size = input.readUnsignedByte();input.skip(1);read += 4;assert (size > 0 && (size % 2) == 0);input.readFully(bytes, 0, size);read += size;String py = new String(bytes, 0, size, 
encoding);//System.out.println(py);pyMap.put(mark, py);if ("zuo".equals(py)) {break;}}if (flag1 == 0x44) {input.skip(0x2628 - read);} else if (flag1 == 0x45) {input.skip(0x26C4 - read);} else {throw new RuntimeException("出现意外,联系作者");}StringBuffer buffer = new StringBuffer();Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();while (true) {int size = readUnsignedShort(input);if (size < 0) {break;}int count = readUnsignedShort(input);int len = count / 2;assert (len * 2 == count);buffer.setLength(0);for (int i = 0; i < len; i++) {int key = readUnsignedShort(input);buffer.append(pyMap.get(key)).append("'");}buffer.setLength(buffer.length() - 1);String py = buffer.toString();List<String> list = wordMap.get(py);if (list == null) {list = new ArrayList<String>();wordMap.put(py, list);}for (int i = 0; i < size; i++) {count = readUnsignedShort(input);if (count > bytes.length) {bytes = new byte[count];}input.readFully(bytes, 0, count);String word = new String(bytes, 0, count, encoding);//接下来12个字节可能是词频或者类似信息input.skip(12);list.add(word);}}//System.out.println(wordMap.size());model.setWordMap(wordMap);return model;} finally {in.close();}}protected final int readUnsignedShort(InputStream in) throws IOException {int ch1 = in.read();int ch2 = in.read();if ((ch1 | ch2) < 0) {return Integer.MIN_VALUE;}return (ch2 << 8) + (ch1 << 0);} }
SougouScelMdel.java
import java.util.List;
import java.util.Map;

/**
 * Plain data holder for the contents of one parsed lexicon: four descriptive strings and
 * the word map. All values are {@code null} until set.
 *
 * @author dengjh
 * @create 2016-11-03 9:40
 */
public class SougouScelMdel {

    private String name;
    private String type;
    private String description;
    private String sample;
    private Map<String, List<String>> wordMap;

    /** @return the lexicon name, or {@code null} if not set */
    public String getName() {
        return this.name;
    }

    /** @param name the lexicon name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the lexicon type, or {@code null} if not set */
    public String getType() {
        return this.type;
    }

    /** @param type the lexicon type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the lexicon description, or {@code null} if not set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the lexicon description */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text, or {@code null} if not set */
    public String getSample() {
        return this.sample;
    }

    /** @param sample the sample text */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the stored word map (the same instance that was set), or {@code null} */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    /** Package-private: stores the given map by reference, without copying. */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }
}
ParseSogo.java 解析词库文件类
/*** 解析搜狗词库文件** @author dengjh* @create 2016-11-03 9:44**/ import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.file.Files; import java.nio.file.LinkOption; import java.nio.file.Paths; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.Map.Entry; public class ParseSogo {public static void main(String[] args)throws Exception {sogou("D:\\scel\\goods.scel","D:\\scel\\goods.txt",true);}/*** 读取scel的词库文件* 生成txt格式的文件* @param inputPath 输入路径* @param outputPath 输出路径* @param isAppend 是否拼接追加词库内容 true 代表追加,false代表重建** **/private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{File file=new File(inputPath);if(!isAppend){if(Files.exists(Paths.get(outputPath),LinkOption.values())){System.out.println("存储此文件已经删除");Files.deleteIfExists(Paths.get(outputPath));}}RandomAccessFile raf=new RandomAccessFile(outputPath, "rw");int count=0;SougouScelMdel model = new SougouScelReader().read(file);Map<String,List<String>> words = model.getWordMap(); //词<拼音,词>Set<Entry<String,List<String>>> set = words.entrySet();Iterator<Entry<String,List<String>>> iter = set.iterator();while(iter.hasNext()){Entry<String,List<String>> entry = iter.next();List<String> list = entry.getValue();int size = list.size();for(int i = 0; i < size; i++){String word = list.get(i);//System.out.println(word);raf.seek(raf.getFilePointer());raf.write((word+"\n").getBytes());//写入txt文件count++;}}raf.close();System.out.println("生成txt成功!,总计写入: "+count+" 条数据!");}}
java解析搜狗词库scel文件到txt相关推荐
- 爬取词库,使用jieba分词库,自定义dict.txt文件+将搜狗词库.scel文件为.txt文件
一:爬取词库,使用jieba分词库,自定义dict.txt文件 import jiebafrom urllib.request import urlopen from bs4 import Beaut ...
- java scel_使用java将搜狗词库.scel文件转化为.txt文件
需求:批量将.scel文件转化为可视的txt文件(支持1对1,多对1,多对多),并从中提取中文词(去重),支持追加内容. 成果: 使用: package com.hxl.files; import j ...
- 搜狗词库scel格式转为txt格式(python3版本)
1.想用搜狗的词库来辅助jieba分词,需要把词库从scel转成txt格式. 在网上找到了大神的python2版本,https://blog.csdn.net/zhangzhenhu/article/ ...
- python读取文本两个数字的成语_只要2步!将搜狗词库(scel)转为Python可读的文本...
该楼层疑似违规已被系统折叠 隐藏此楼查看此楼 将搜狗词库(scel)转化为python可读的文本(text)的方法方法 1. 利用R语言(方法简单) ① 载入词库(R语言) library(Rword ...
- 将搜狗词库.scel格式转化为.txt格式
[2020年5月28日更新:有一说一,这篇文章是我2017年底在新浪工作时处理家居.房产频道相关业务时的实践,代码是后来从自己代码库直接粘贴的,当然转码部分的代码是借鉴的,当时也是查阅了几种方法,一一 ...
- java通过搜狗词库过滤指定词性,JAVA通过搜狗词库过滤指定词性
http://www.0x32.cn/html/y2010/563.html 在测试过程中需要从文本中拿到指定词性的词,比如名词或者动词,各种词性的定义我们可以依靠搜狗的语料库来实现,从搜狗实验室下载 ...
- 解析搜狗词库(python)
#!/usr/bin/python # -*- coding: utf-8 -*-import struct import sys import binascii import pdb #搜狗的sce ...
- python词库_解析搜狗词库(python)
#!/usr/bin/python # -*- coding: utf-8 -*- import struct import sys import binascii import pdb #搜狗的sc ...
- 搜狗词库的批量下载#Python
在制作电子病历全文索引时,需要建立索引,索引是根据索引词建立的,现有索引词匮乏,不能满足需求,搜寻之后,发现搜狗输入法的医学词库很庞大,所以,想着自学写一个Python脚本,完成词库的自动下载工作. ...
- Scrapy 搜狗词库爬虫
引言 最近在学习Python爬虫,这里推荐一个入门爬虫的博客系列 https://github.com/Ehco1996/Python-crawler 博主写的对新手很友好,很适合入门. 我写这篇文章 ...
最新文章
- Linux内核Makefile
- JMeter3.0 post参数/BeanShell中文乱码问题
- FFmpeg4.1编译:mac+android-ndk-14b+ffmpeg4.1成功编译
- mysql varchar最多可以存多少汉字_MySql的这几个坑你踩过没?真是防不胜防!
- 动态壁纸安卓_安卓 高清 动态 壁纸
- 黄聪: 50 个 Bootstrap 插件
- 修复虚拟磁盘LVM表
- QT+VS中ui不能声明为指针?
- [转]Basic OCR in OpenCV
- GALGAME 剧本提取工具
- Docker-Cgroup 资源配置方法
- 傅里叶分析——傅里叶级数
- 电商十二、pinyougou02.sql的内容③
- 移动支付的方式有哪些拾方易告诉你
- java快捷键格式化_在Java中Format的快捷键是什么?
- 用App Designer 制作2048小游戏
- 【Vue.js】Vue.js中常用的UI组件库和Vue Router
- 阜阳睿趣机器人编程_编程教育中心怎么样睿趣疯狂机器人_睿诚教育蒸蒸日上...
- Cisco VTP配置
- 关于单片机替代PLC的思考
热门文章
- abb变频器580系列改中文_ABBACS580一01变频器选择使用语言错误后怎么办?
- 上海财经应用统计考python_2021年上海财经大学应用统计硕士考研必看成功上岸前辈复习经验分享...
- 对python生成的EXE文件 进行反编译
- Kettle连接mysql数据库所需驱动包,出现报错情况(附驱动下载方法)
- input输入框[type=file]上传图片文件转base64数据
- OnlineDict:Chrome取词翻译扩展
- 编写GOM引擎登录器,直接启动GOM客户端DAT文件不掉线
- 如何干净完整卸载office2010
- 解决AD不能导入CAD文件
- Pandas速查手册中文版API