Lucene加中文分词paoding调研结果

因为项目原因，调研了下全文检索。网上最流行的开源检索工具好像是Lucene；Nutch据说稳定性有待测试，所以没试。需要说明的是，我要做的这个全文检索是搜索本项目的文档、网页和数据库内容，不涉及web上网页的搜索。

介绍Lucene的文章很多，重点的我都看了。原理不明白的可以看下面这篇：

http://www.ibm.com/developerworks/cn/java/wa-lucene/

自己参照别人的写了个例子。我用的是lucene2.3.1版本，这里需要注意版本不一样的话写的代码就有所区别，网上很多例子都不是参照新版的写的，不能直接用。

import org.apache.lucene.index.IndexWriter;

import java.io.File;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.Searcher;

import java.io.FileReader;

import org.apache.lucene.document.*;

import org.apache.lucene.store.*;

import net.paoding.analysis.analyzer.PaodingAnalyzer;

import java.io.*;

import org.apache.poi.hwpf.extractor.WordExtractor;

import org.apache.lucene.search.BooleanQuery;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.highlight.*;

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class Mp3Searcher {

private String DATA_DIR ; //文件的目录（如果是对文件进行索引的话）

private String index_DIR ;//存放索引的目录

private RAMDirectory directory ;

private PaodingAnalyzer analyzer = null;

//private Analyzer analyzer = null;

public Mp3Searcher(){

//Analyzer analyzer = new PaodingAnalyzer();

}

public String getindedxdir()

{

return this.index_DIR;

}

public void buildIndex() throws IOException{

String DATA_DIR="C://lucenetest//index"; //存放文件目录

String index_DIR="E://test"; //存放索引文件目录

File data_Dir = new File(DATA_DIR);

File index_Dir = new File(index_DIR);

this.index_DIR=index_DIR;

Analyzer analyzer = new PaodingAnalyzer();

//Analyzer analyzer = new StandardAnalyzer();

File[] dataFiles = data_Dir.listFiles();

boolean fileIsExist = false;

if (index_Dir.listFiles().length == 0)

fileIsExist = true;

IndexWriter writer = new IndexWriter(index_Dir, analyzer, fileIsExist);

try{

this.doIndex(dataFiles,writer);

}catch(Exception e)

{

e.printStackTrace();

}

writer.optimize();

writer.close();

}

private void doIndex(File[] dataFiles, IndexWriter indexWriter) throws Exception {

for (int i = 0; i < dataFiles.length; i++) {

if (dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".htm"))

{//索引所有htm格式文件

System.out.println("Indexing file " + dataFiles[i].getCanonicalPath());

Reader txtReader = new FileReader(dataFiles[i]);

Document document = new Document();

document.add(new Field("path", dataFiles[i].getCanonicalPath(), Field.Store.YES,Field.Index.UN_TOKENIZED));

document.add(new Field("filename", dataFiles[i].getName(), Field.Store.YES, Field.Index.TOKENIZED));

document.add(new Field("contents", txtReader));

indexWriter.addDocument(document);

}

else if (dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".doc"))

{

FileInputStream in = new FileInputStream(dataFiles[i]);//获得文件流

WordExtractor extractor = new WordExtractor(in);//使用POI对word文件进行解析

String str = extractor.getText();//返回String

Document document = new Document();//生成Document对象,其中有3个Field,分别是path,filename,contents

document.add(new Field("path", dataFiles[i].getCanonicalPath(), Field.Store.YES,Field.Index.UN_TOKENIZED));

document.add(new Field("filename", dataFiles[i].getName(), Field.Store.YES, Field.Index.TOKENIZED));

document.add(new Field("contents", str, Field.Store.YES,Field.Index.TOKENIZED,Field.TermVector.WITH_POSITIONS_OFFSETS));

System.out.print(document.getField("path").toString()+document.getField("filename").toString()+document.getField("contents").toString());

indexWriter.addDocument(document);

}

else if (dataFiles[i].isDirectory())

doIndex(dataFiles[i].listFiles(), indexWriter);//使用递归,继续索引文件夹

}

// public void searchIndex(String curcontents,String curfilename) throws Exception {

public void searchIndex(String curcontents) throws Exception {

String contents = curcontents;//内容的关键字

// String filename = curfilename;//文件名的关键字

File indexDir = new File(index_DIR);//存放索引的文件夹

FSDirectory directory = FSDirectory.getDirectory(indexDir);

Searcher searcher = new IndexSearcher(directory);

Analyzer analyzer = new PaodingAnalyzer();

//Analyzer analyzer = new StandardAnalyzer();

QueryParser parserContents = new QueryParser("contents", analyzer);

Query query1 = parserContents.parse(contents);

// QueryParser parserFilename = new QueryParser("filename", analyzer); //使用同一个分析器luceneAnalyzer分别生成两个QueryParser对象

// Query query2 = parserFilename.parse(filename);

BooleanQuery query = new BooleanQuery();

query.add(query1, BooleanClause.Occur.MUST);

// query.add(query2, BooleanClause.Occur.MUST);

//SimpleHTMLFormatter formatter =new SimpleHTMLFormatter("<span class=/"highlight/">","</span>");

// SimpleHTMLFormatter formatter =new SimpleHTMLFormatter();

// Highlighter highlighter = new Highlighter(formatter,new QueryScorer(query));

Highlighter highlighter = new Highlighter(new QueryScorer(query));

highlighter.setTextFragmenter(new SimpleFragmenter(60)); //Lucene自带的高亮功能

Hits hits = searcher.search(query);

for(int i=0;i<hits.length();i++){

Document doc=hits.doc(i);

System.out.println("检索文件"+doc.get("path"));

String contents1=doc.get("contents");

if (contents1!=null)

{

TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(contents1));

String str = highlighter.getBestFragment(tokenStream,hits.doc(i).get("contents"));

System.out.println(str);

}

public static void main(String[] args) throws Exception

{

Mp3Searcher searcher=new Mp3Searcher();

String DATA_DIR="C://lucenetest//index";

String index_DIR="E://test";

searcher.buildIndex();

//searcher.searchIndex("lucene","绝缘材料");

searcher.searchIndex("索引");

}

以上代码已经在eclipse3.2上测试通过，建的是java项目。代码中有些注释的部分也是能用的，稍微修改下就行。

下面重点说下中文分词。因为lucene是老外写的，所以它对英文、德文等支持比较好，对中文的支持比较弱。用lucene自带的分析器处理中文，要么是StandardAnalyzer（按单字切分），要么是CJKAnalyzer（采用二元分词法），分词效果都不理想，所以国人也开发了很多中文分词的软件。网上很有名的就是中科院的，可是中科院的这个是c写的，也有人把它改为java，但是据说bug很多，所以我也没下。我稍微看了下，好像是说中科院的这个分词的dll要用jni，用在lucene.net这个版本上，但是这个版本已经旧了，所以我没调研。其他的中文分词软件我主要看了下庖丁解牛分词软件。

首先说明这个软件在java项目中是可以使用的，问题主要在web项目中。

http://paoding.googlecode.com/svn/trunk/paoding-analysis/

最好是下svn上的，因为这个是更新过的，更改了2.0.4版的一些bug。

论坛地址：http://groups.google.com/group/paoding/topics?start=20&sa=N

网上也有人反映使用中出现的各种错误，主要是如下这些，我都碰到过：

首先是需要设置 Paoding 词典到我们的系统环境变量 , 该词典就在 Paoding 的 dic 文件夹里 , 把它设置到环境变量中 , 变量名是 PAODING_DIC_HOME, 这里要注意 :DIC 的路径不能包含中文 , 标点 , 空格 ...

然后 , 再使用的时候 , 你需要将 Paoding 的 5 个 jar 包拷贝到你的工程中去 , 但是要注意你工程的路径 , 如果你发布的是 WEB 工程 , 记得你的服务器路径也不能包含中文空格 , 之前我的路径是

g:/Tomcat 6.0 就不行 , 被迫改成了 g:/Tomcat

如果上面设置没问题 , 就可以使用 Paoding 进行搜索开发了 , 但是在建立索引的时候你可能还会遇到 java.io.File.setWritable(Z)Z 异常问题 ( 即 NoSuchMethodError)。 File.setWritable(boolean) 是 JDK 6 才加入的方法 , 在 JDK 1.5 上运行用到它的代码就会报这个错 , 所以这个一般是你 JDK 的问题 , 升级下你 JDK 的版本 .

我就遇到过很奇怪的问题 , 我使用 JDK1.5 一直没问题 , 但是有一天突然爆了这个错误 , 让我郁闷 , 最后升级了个 JDK, 问题就解决了 ....

以上是网上别人反映的问题，我自己碰到的问题主要是这几个：

1. Classpath 的路径 paoding 总是不识别，导致无法正确加载词库。无论在 properties 文件中如何设置都搞不好，也许是 jdk 版本问题，我用的是 1.5 的。这样导致在部署到 tomcat5.0 时总是报路径不对。

2. 我建了个 jsp 和 servlet ，将结果显示在 servlet 中。结果发现在 jsp 页面上输入中文关键字时页面无任何结果，但是搜英文的就有结果。

这个问题很多人看了都认为是个乱码处理问题，但是我调了很久，各种方式都试了，仍然不行。网上也有人反映和我一样的问题，但是没有解决办法。

这两个重大问题导致我放弃了 paoding ，还有一点是分词软件是需要维护的，网上的成功的基于 lucene 的网站如 jlive 等，我想它们可能是购买的某公司的中文分词的解析器来做搜索的。

备注：本人编程能力比较差，暂时只能认识到这个层次。

Lucene加中文分词paoding调研结果相关推荐

如何在基于Lucene的中文分词器中添加自定义词典（如Paoding、mmseg4j、IK Analyzer）...
如何在基于Lucene的中文分词器中添加自定义词典(如Paoding.mmseg4j.IK Analyzer) 2013-07-08 21:54:29| 分类: 计算机 |字号订阅 1. 使用Pa ...
Lucene支持中文分词代码实现
支持中文分词分析器(Analyzer)的执行过程如下图是语汇单元的生成过程: 从一个Reader字符流开始,创建一个基于Reader的Tokenizer分词器,经过三个TokenFilter生成语 ...
（转）全文检索技术学习(三)——Lucene支持中文分词
http://blog.csdn.net/yerenyuan_pku/article/details/72591778 分析器(Analyzer)的执行过程如下图是语汇单元的生成过程: 从一个 ...
Nutch 分词中文分词 paoding 疱丁
Nutch中文分词总结 2009年06月10日星期三 22:38 1 中文分词介绍中文分词是在做检索类系统时需要重点考虑的一个因素.Nutch的本土化过程也需要更改对中文分词的支持.目前,Nutc ...
Lucene.net中文分词探究
一.中文分词方式: 中文分词几种常用的方式: A．单字分词单字分词,顾名思义,就是按照中文一个字一个字地进行分词.如:我们是中国人,效果:我/们/是/中/国/人. B．二分法二分法,就是按两个 ...
向Lucene增加中文分词功能
一.分词功能介绍分词模块对于搜索的重要性不言而喻.例如,没有分词时,搜索"和服"会出现"产品和服务",搜索"海尔"会出现"海尔德 ...
[更新中]Lucene.net,中文分词技术 ICTCLAS研究
http://groups.google.com/group/ictclas http://blog.csdn.net/sinboy/archive/2006/03/12/622596.aspx ht ...
lucene可用中文分词IKAnalyzer,maven pom下载代码及配置文件
1.pom代码 <dependency><groupId>com.janeluo</groupId><artifactId>ikanalyzer< ...
Lucene中文分词Demo
本文记录Lucene+Paoding的使用方法图解: 一.下载Lucene(官网:http://archive.apache.org/dist/lucene/java/)本文中使用的是:2.9.4,下 ...

Lucene加中文分词paoding调研结果

Lucene加中文分词paoding调研结果相关推荐

最新文章

热门文章