SougouScelReader.java 读取词库文件类

import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/*** 读取搜索词库** @author dengjh* @create 2016-11-03 9:39**/
public class SougouScelReader {public SougouScelMdel read(File file) throws IOException {return read(new FileInputStream(file));}public SougouScelMdel read(URL url) throws IOException {return read(url.openStream());}protected ByteArrayOutputStream output=new ByteArrayOutputStream();protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {int read=reads[0];input.skip(pos-read);read=pos;output.reset();while(true) {int c1 = input.read();int c2 = input.read();read+=2;if(c1==0 && c2==0) {break;} else {output.write(c1);output.write(c2);}}reads[0]=read;return new String(output.toByteArray(),encoding);}protected static String encoding = "UTF-16LE";public SougouScelMdel read(InputStream in) throws IOException {SougouScelMdel model = new SougouScelMdel();DataInputStream input = new DataInputStream(in);int read;try {byte[] bytes = new byte[4];input.readFully(bytes);assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);input.readFully(bytes);int flag1 = bytes[0];assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);int[] reads=new int[]{8};model.setName(readString(input,0x130,reads));model.setType(readString(input,0x338,reads));model.setDescription(readString(input,0x540,reads));model.setSample(readString(input,0xd40,reads));read = reads[0];input.skip(0x1540 - read);read=0x1540;input.readFully(bytes);read += 4;assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);bytes = new byte[128];Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();while (true) {int mark = readUnsignedShort(input);int size = input.readUnsignedByte();input.skip(1);read += 4;assert (size > 0 && (size % 2) == 0);input.readFully(bytes, 0, size);read += size;String py = new String(bytes, 0, size, encoding);//System.out.println(py);pyMap.put(mark, py);if ("zuo".equals(py)) {break;}}if (flag1 == 0x44) {input.skip(0x2628 - read);} else if (flag1 == 0x45) {input.skip(0x26C4 - read);} else {throw 
new RuntimeException("出现意外,联系作者");}StringBuffer buffer = new StringBuffer();Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();while (true) {int size = readUnsignedShort(input);if (size < 0) {break;}int count = readUnsignedShort(input);int len = count / 2;assert (len * 2 == count);buffer.setLength(0);for (int i = 0; i < len; i++) {int key = readUnsignedShort(input);buffer.append(pyMap.get(key)).append("'");}buffer.setLength(buffer.length() - 1);String py = buffer.toString();List<String> list = wordMap.get(py);if (list == null) {list = new ArrayList<String>();wordMap.put(py, list);}for (int i = 0; i < size; i++) {count = readUnsignedShort(input);if (count > bytes.length) {bytes = new byte[count];}input.readFully(bytes, 0, count);String word = new String(bytes, 0, count, encoding);//接下来12个字节可能是词频或者类似信息input.skip(12);list.add(word);}}//System.out.println(wordMap.size());model.setWordMap(wordMap);return model;} finally {in.close();}}protected final int readUnsignedShort(InputStream in) throws IOException {int ch1 = in.read();int ch2 = in.read();if ((ch1 | ch2) < 0) {return Integer.MIN_VALUE;}return (ch2 << 8) + (ch1 << 0);}
}

SougouScelMdel.java

import java.util.List;
import java.util.Map;/*** @author dengjh* @create 2016-11-03 9:40**/
/**
 * Holds the contents of one parsed dictionary: four descriptive strings and the words
 * grouped by pinyin.
 */
public class SougouScelMdel {

    private String name;
    private String type;
    private String description;
    private String sample;
    private Map<String, List<String>> wordMap;

    /** @return the dictionary name, or {@code null} if it was never set */
    public String getName() {
        return this.name;
    }

    /** @param name the dictionary name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the dictionary type, or {@code null} if it was never set */
    public String getType() {
        return this.type;
    }

    /** @param type the dictionary type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the dictionary description, or {@code null} if it was never set */
    public String getDescription() {
        return this.description;
    }

    /** @param description the dictionary description */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text, or {@code null} if it was never set */
    public String getSample() {
        return this.sample;
    }

    /** @param sample the sample text */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /**
     * @return the stored map from pinyin to words (the same instance that was set, not a
     *     copy), or {@code null} if it was never set
     */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    /** Package-private setter: stores the given map as-is. */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }
}

ParseSogo.java 解析词库文件类

/*** 解析搜狗词库文件** @author dengjh* @create 2016-11-03 9:44**/
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
public class ParseSogo {public static void main(String[] args)throws Exception {sogou("D:\\scel\\goods.scel","D:\\scel\\goods.txt",true);}/*** 读取scel的词库文件* 生成txt格式的文件* @param inputPath 输入路径* @param outputPath 输出路径* @param isAppend  是否拼接追加词库内容 true 代表追加,false代表重建** **/private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{File file=new File(inputPath);if(!isAppend){if(Files.exists(Paths.get(outputPath),LinkOption.values())){System.out.println("存储此文件已经删除");Files.deleteIfExists(Paths.get(outputPath));}}RandomAccessFile raf=new RandomAccessFile(outputPath, "rw");int count=0;SougouScelMdel model = new SougouScelReader().read(file);Map<String,List<String>> words = model.getWordMap(); //词<拼音,词>Set<Entry<String,List<String>>> set = words.entrySet();Iterator<Entry<String,List<String>>> iter = set.iterator();while(iter.hasNext()){Entry<String,List<String>> entry = iter.next();List<String> list = entry.getValue();int size = list.size();for(int i = 0; i < size; i++){String word = list.get(i);//System.out.println(word);raf.seek(raf.getFilePointer());raf.write((word+"\n").getBytes());//写入txt文件count++;}}raf.close();System.out.println("生成txt成功!,总计写入: "+count+" 条数据!");}}

java解析搜狗词库scel文件到txt相关推荐

  1. 爬取词库,使用jieba分词库,自定义dict.txt文件+将搜狗词库.scel文件为.txt文件

    一:爬取词库,使用jieba分词库,自定义dict.txt文件 import jiebafrom urllib.request import urlopen from bs4 import Beaut ...

  2. java scel_使用java将搜狗词库.scel文件转化为.txt文件

    需求:批量将.scel文件转化为可视的txt文件(支持1对1,多对1,多对多),并从中提取中文词(去重),支持追加内容. 成果: 使用: package com.hxl.files; import j ...

  3. 搜狗词库scel格式转为txt格式(python3版本)

    1.想用搜狗的词库来辅助jieba分词,需要把词库从scel转成txt格式. 在网上找到了大神的python2版本,https://blog.csdn.net/zhangzhenhu/article/ ...

  4. python读取文本两个数字的成语_只要2步!将搜狗词库(scel)转为Python可读的文本...

    该楼层疑似违规已被系统折叠 隐藏此楼查看此楼 将搜狗词库(scel)转化为python可读的文本(text)的方法方法 1. 利用R语言(方法简单) ① 载入词库(R语言) library(Rword ...

  5. 将搜狗词库.scel格式转化为.txt格式

    [2020年5月28日更新:有一说一,这篇文章是我2017年底在新浪工作时处理家居.房产频道相关业务时的实践,代码是后来从自己代码库直接粘贴的,当然转码部分的代码是借鉴的,当时也是查阅了几种方法,一一 ...

  6. java通过搜狗词库过滤指定词性,JAVA通过搜狗词库过滤指定词性

    http://www.0x32.cn/html/y2010/563.html 在测试过程中需要从文本中拿到指定词性的词,比如名词或者动词,各种词性的定义我们可以依靠搜狗的语料库来实现,从搜狗实验室下载 ...

  7. 解析搜狗词库(python)

    #!/usr/bin/python # -*- coding: utf-8 -*-import struct import sys import binascii import pdb #搜狗的sce ...

  8. python词库_解析搜狗词库(python)

    #!/usr/bin/python # -*- coding: utf-8 -*- import struct import sys import binascii import pdb #搜狗的sc ...

  9. 搜狗词库的批量下载#Python

    在制作电子病历全文索引时,需要建立索引,索引是根据索引词建立的,现有索引词匮乏,不能满足需求,搜寻之后,发现搜狗输入法的医学词库很庞大,所以,想着自学写一个Python脚本,完成词库的自动下载工作. ...

  10. Scrapy 搜狗词库爬虫

    引言 最近在学习Python爬虫,这里推荐一个入门爬虫的博客系列 https://github.com/Ehco1996/Python-crawler 博主写的对新手很友好,很适合入门. 我写这篇文章 ...

最新文章

  1. Linux内核Makefile
  2. JMeter3.0 post参数/BeanShell中文乱码问题
  3. FFmpeg4.1编译:mac+android-ndk-14b+ffmpeg4.1成功编译
  4. mysql varchar最多可以存多少汉字_MySql的这几个坑你踩过没?真是防不胜防!
  5. 动态壁纸安卓_安卓 高清 动态 壁纸
  6. 黄聪: 50 个 Bootstrap 插件
  7. 修复虚拟磁盘LVM表
  8. QT+VS中ui不能声明为指针?
  9. [转]Basic OCR in OpenCV
  10. GALGAME 剧本提取工具
  11. Docker-Cgroup 资源配置方法
  12. 傅里叶分析——傅里叶级数
  13. 电商十二、pinyougou02.sql的内容③
  14. 移动支付的方式有哪些拾方易告诉你
  15. java快捷键格式化_在Java中Format的快捷键是什么?
  16. 用App Designer 制作2048小游戏
  17. 【Vue.js】Vue.js中常用的UI组件库和Vue Router
  18. 阜阳睿趣机器人编程_编程教育中心怎么样睿趣疯狂机器人_睿诚教育蒸蒸日上...
  19. Cisco VTP配置
  20. 关于单片机替代PLC的思考

热门文章

  1. abb变频器580系列改中文_ABBACS580一01变频器选择使用语言错误后怎么办?
  2. 上海财经应用统计考python_2021年上海财经大学应用统计硕士考研必看成功上岸前辈复习经验分享...
  3. 对python生成的EXE文件 进行反编译
  4. Kettle连接mysql数据库所需驱动包,出现报错情况(附驱动下载方法)
  5. input输入框[type=file]上传图片文件转base64数据
  6. OnlineDict:Chrome取词翻译扩展
  7. 编写GOM引擎登录器,直接启动GOM客户端DAT文件不掉线
  8. 如何干净完整卸载office2010
  9. 解决AD不能导入CAD文件
  10. Pandas速查手册中文版API