java解析搜狗词库scel文件到txt

SougouScelReader.java 读取词库文件类

import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/*** 读取搜索词库** @author dengjh* @create 2016-11-03 9:39**/
public class SougouScelReader {public SougouScelMdel read(File file) throws IOException {return read(new FileInputStream(file));}public SougouScelMdel read(URL url) throws IOException {return read(url.openStream());}protected ByteArrayOutputStream output=new ByteArrayOutputStream();protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {int read=reads[0];input.skip(pos-read);read=pos;output.reset();while(true) {int c1 = input.read();int c2 = input.read();read+=2;if(c1==0 && c2==0) {break;} else {output.write(c1);output.write(c2);}}reads[0]=read;return new String(output.toByteArray(),encoding);}protected static String encoding = "UTF-16LE";public SougouScelMdel read(InputStream in) throws IOException {SougouScelMdel model = new SougouScelMdel();DataInputStream input = new DataInputStream(in);int read;try {byte[] bytes = new byte[4];input.readFully(bytes);assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);input.readFully(bytes);int flag1 = bytes[0];assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);int[] reads=new int[]{8};model.setName(readString(input,0x130,reads));model.setType(readString(input,0x338,reads));model.setDescription(readString(input,0x540,reads));model.setSample(readString(input,0xd40,reads));read = reads[0];input.skip(0x1540 - read);read=0x1540;input.readFully(bytes);read += 4;assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);bytes = new byte[128];Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();while (true) {int mark = readUnsignedShort(input);int size = input.readUnsignedByte();input.skip(1);read += 4;assert (size > 0 && (size % 2) == 0);input.readFully(bytes, 0, size);read += size;String py = new String(bytes, 0, size, encoding);//System.out.println(py);pyMap.put(mark, py);if ("zuo".equals(py)) {break;}}if (flag1 == 0x44) {input.skip(0x2628 - read);} else if (flag1 == 0x45) {input.skip(0x26C4 - read);} else {throw 
new RuntimeException("出现意外，联系作者");}StringBuffer buffer = new StringBuffer();Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();while (true) {int size = readUnsignedShort(input);if (size < 0) {break;}int count = readUnsignedShort(input);int len = count / 2;assert (len * 2 == count);buffer.setLength(0);for (int i = 0; i < len; i++) {int key = readUnsignedShort(input);buffer.append(pyMap.get(key)).append("'");}buffer.setLength(buffer.length() - 1);String py = buffer.toString();List<String> list = wordMap.get(py);if (list == null) {list = new ArrayList<String>();wordMap.put(py, list);}for (int i = 0; i < size; i++) {count = readUnsignedShort(input);if (count > bytes.length) {bytes = new byte[count];}input.readFully(bytes, 0, count);String word = new String(bytes, 0, count, encoding);//接下来12个字节可能是词频或者类似信息input.skip(12);list.add(word);}}//System.out.println(wordMap.size());model.setWordMap(wordMap);return model;} finally {in.close();}}protected final int readUnsignedShort(InputStream in) throws IOException {int ch1 = in.read();int ch2 = in.read();if ((ch1 | ch2) < 0) {return Integer.MIN_VALUE;}return (ch2 << 8) + (ch1 << 0);}
}

SougouScelMdel.java 词库数据模型类

import java.util.List;
import java.util.Map;/*** @author dengjh* @create 2016-11-03 9:40**/
/**
 * Plain data holder for the content of one parsed dictionary: four descriptive
 * strings plus the words grouped by pinyin. Every property is {@code null} until
 * its setter has been called.
 */
public class SougouScelMdel {

    /** Dictionary name. */
    private String name;

    /** Dictionary type. */
    private String type;

    /** Dictionary description. */
    private String description;

    /** Sample text of the dictionary. */
    private String sample;

    /** Words grouped by pinyin key. */
    private Map<String, List<String>> wordMap;

    /** @return the dictionary name */
    public String getName() {
        return this.name;
    }

    /** @param name the dictionary name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the dictionary type */
    public String getType() {
        return this.type;
    }

    /** @param type the dictionary type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the dictionary description */
    public String getDescription() {
        return this.description;
    }

    /** @param description the dictionary description */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text */
    public String getSample() {
        return this.sample;
    }

    /** @param sample the sample text */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the pinyin-to-words map; the stored instance itself, not a copy */
    public Map<String, List<String>> getWordMap() {
        return this.wordMap;
    }

    /**
     * Stores the pinyin-to-words map; package-private, unlike the other setters.
     *
     * @param wordMap map from pinyin key to the words spelled that way
     */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }
}

ParseSogo.java 解析词库文件类

/*** 解析搜狗词库文件** @author dengjh* @create 2016-11-03 9:44**/
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;
public class ParseSogo {public static void main(String[] args)throws Exception {sogou("D:\\scel\\goods.scel","D:\\scel\\goods.txt",true);}/*** 读取scel的词库文件* 生成txt格式的文件* @param inputPath 输入路径* @param outputPath 输出路径* @param isAppend  是否拼接追加词库内容 true 代表追加,false代表重建** **/private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{File file=new File(inputPath);if(!isAppend){if(Files.exists(Paths.get(outputPath),LinkOption.values())){System.out.println("存储此文件已经删除");Files.deleteIfExists(Paths.get(outputPath));}}RandomAccessFile raf=new RandomAccessFile(outputPath, "rw");int count=0;SougouScelMdel model = new SougouScelReader().read(file);Map<String,List<String>> words = model.getWordMap(); //词<拼音,词>Set<Entry<String,List<String>>> set = words.entrySet();Iterator<Entry<String,List<String>>> iter = set.iterator();while(iter.hasNext()){Entry<String,List<String>> entry = iter.next();List<String> list = entry.getValue();int size = list.size();for(int i = 0; i < size; i++){String word = list.get(i);//System.out.println(word);raf.seek(raf.getFilePointer());raf.write((word+"\n").getBytes());//写入txt文件count++;}}raf.close();System.out.println("生成txt成功！,总计写入: "+count+" 条数据！");}}

java解析搜狗词库scel文件到txt相关推荐

爬取词库，使用jieba分词库，自定义dict.txt文件+将搜狗词库.scel文件为.txt文件
一:爬取词库,使用jieba分词库,自定义dict.txt文件 import jiebafrom urllib.request import urlopen from bs4 import Beaut ...
java scel_使用java将搜狗词库.scel文件转化为.txt文件
需求:批量将.scel文件转化为可视的txt文件(支持1对1,多对1,多对多),并从中提取中文词(去重),支持追加内容. 成果: 使用: package com.hxl.files; import j ...
搜狗词库scel格式转为txt格式（python3版本）
1.想用搜狗的词库来辅助jieba分词,需要把词库从scel转成txt格式. 在网上找到了大神的python2版本,https://blog.csdn.net/zhangzhenhu/article/ ...
python读取文本两个数字的成语_只要2步！将搜狗词库(scel)转为Python可读的文本...
该楼层疑似违规已被系统折叠隐藏此楼查看此楼将搜狗词库(scel)转化为python可读的文本(text)的方法方法 1. 利用R语言(方法简单) ① 载入词库(R语言) library(Rword ...
将搜狗词库.scel格式转化为.txt格式
[2020年5月28日更新:有一说一,这篇文章是我2017年底在新浪工作时处理家居.房产频道相关业务时的实践,代码是后来从自己代码库直接粘贴的,当然转码部分的代码是借鉴的,当时也是查阅了几种方法,一一 ...
java通过搜狗词库过滤指定词性,JAVA通过搜狗词库过滤指定词性
http://www.0x32.cn/html/y2010/563.html 在测试过程中需要从文本中拿到指定词性的词,比如名词或者动词,各种词性的定义我们可以依靠搜狗的语料库来实现,从搜狗实验室下载 ...
解析搜狗词库(python)
#!/usr/bin/python # -*- coding: utf-8 -*-import struct import sys import binascii import pdb #搜狗的sce ...
python词库_解析搜狗词库(python)
#!/usr/bin/python # -*- coding: utf-8 -*- import struct import sys import binascii import pdb #搜狗的sc ...
搜狗词库的批量下载#Python
在制作电子病历全文索引时,需要建立索引,索引是根据索引词建立的,现有索引词匮乏,不能满足需求,搜寻之后,发现搜狗输入法的医学词库很庞大,所以,想着自学写一个Python脚本,完成词库的自动下载工作. ...
Scrapy 搜狗词库爬虫
引言最近在学习Python爬虫,这里推荐一个入门爬虫的博客系列 https://github.com/Ehco1996/Python-crawler 博主写的对新手很友好,很适合入门. 我写这篇文章 ...

java解析搜狗词库scel文件到txt

java解析搜狗词库scel文件到txt相关推荐

最新文章

热门文章