Lucene 5 TokenStream

2019独角兽企业重金招聘Python工程师标准>>>

package com.lucene5.dream;import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;import org.ansj.lucene5.AnsjAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.SortField.Type;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;import com.lucene5.demo.LuceneQueryTest1;public class TokenStreamTest {static Analyzer analyzer;static Directory d;static IndexWriterConfig conf;static IndexWriter indexWriter;final static String queryKeyWord1 = "华美";private static final FieldType DOUBLE_FIELD_TYPE_STORED_SORTED = new FieldType();static {DOUBLE_FIELD_TYPE_STORED_SORTED.setTokenized(true);DOUBLE_FIELD_TYPE_STORED_SORTED.setOmitNorms(true);DOUBLE_FIELD_TYPE_STORED_SORTED.setIndexOptions(IndexOptions.DOCS);DOUBLE_FIELD_TYPE_STORED_SORTED.setNumericType(FieldType.NumericType.DOUBLE);DOUBLE_FIELD_TYPE_STORED_SORTED.setStored(true);DOUBLE_FIELD_TYPE_STORED_SORTED.setDocValuesType(DocValuesType.NUMERIC);DOUBLE_FIELD_TYPE_STORED_SORTED.freeze();}@BeforeClasspublic static void setup() throws Exception {analyzer = new AnsjAnalyzer("user");d = new RAMDirectory();conf = new IndexWriterConfig(analyzer);indexWriter = new IndexWriter(d, conf);InputStream is = LuceneQueryTest1.class.getResourceAsStream("/data/data");BufferedReader br = new BufferedReader(new InputStreamReader(is));String line = null;while ((line = br.readLine()) != null) {String[] elements = line.split("##");Document document = new Document();StringField category = new StringField("category", elements[0], Store.YES);TextField brandName = new TextField("brandName", elements[1], Store.YES);TextField productName = new TextField("productName", elements[2], Store.YES);DoubleField price = new DoubleField("price", Double.valueOf(elements[3]), DOUBLE_FIELD_TYPE_STORED_SORTED);document.add(category);document.add(brandName);document.add(productName);document.add(price);indexWriter.addDocument(document);}indexWriter.commit();indexWriter.close();br.close();is.close();}@AfterClasspublic static void teardown() {try {indexWriter.close();} catch (IOException e) {e.printStackTrace();}}@Testpublic void testSearchBySort() {StringReader reader = new StringReader("我爱北京天安门，我在天安门广场上看见很多人");TokenStream tokenStream = null;try {tokenStream = 
analyzer.tokenStream("message", reader);CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);FlagsAttribute flagsAttribute = tokenStream.getAttribute(FlagsAttribute.class);PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);PayloadAttribute payloadAttribute = tokenStream.getAttribute(PayloadAttribute.class);OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);tokenStream.reset();while (tokenStream.incrementToken()) {System.err.print("##");System.err.print("charTermAttribute:" + charTermAttribute.toString());System.err.print("type:" + typeAttribute.type());// System.err.print(flagsAttribute.getFlags());System.err.print(positionIncrementAttribute.getPositionIncrement());// System.err.print(payloadAttribute.getPayload());System.err.print("start:" + offsetAttribute.startOffset());System.err.print("end:" + offsetAttribute.endOffset());System.err.print("##");System.err.println();}tokenStream.end();} catch (Exception e) {e.printStackTrace();} finally {try {tokenStream.close();} catch (IOException e) {e.printStackTrace();}analyzer.close();}}}/*** 存储了每一个索引位置的有效载荷。他们被生成有用的评分信息在有效信息查询的时候。他们在不同的位置都有存储OffsetAttribute：* * 这个记录当前的term在文档中的起始和结束位置* * TypeAttribute* * 定义数据类型* * FlagsAttribute* * 和TypeAttribute类似。但是他有别的用途* * Suppose you need to add specific information about a token and that* information should be available down the analyzer chain, you can pass it as* flags. TokenFilters can perform any specific action based on the flags of the* token* * 假设你有其他的特殊信息。并且这些信息必须在分析器链上沉淀下来。你可以 像flag跳过他们。TokenFilter 可以被执行任何页数的动作在token的* 标志上* * PayloadAttribute* * This stores the payload at each index position and is generally useful in* scoring when used with Payload-based queries. 
Because it's stored at each* position, it is best to have a minimum number of bytes per term in the index* to minimize overloading the index with a massive amount of data.*/

转载于:https://my.oschina.net/payzheng/blog/627745

Lucene 5 TokenStream相关推荐

Lucene（8_2_0）核心API学习之 TokenStream（一）
一.继承 org.apache.lucene.analysis.TokenStream extends org.apache.lucene.util.AttributeSource 二.详情 Tok ...
lucene分词器中的Analyzer,TokenStream, Tokenizer, TokenFilter
分词器的核心类: Analyzer: 分词器 TokenStream: 分词器做好处理之后得到的一个流.这个流中存储了分词的各种信息,可以通过TokenStream有效的获取到分词单元. 以下是把文件 ...
Lucene学习-深入Lucene分词器,TokenStream获取分词详细信息
Lucene学习-深入Lucene分词器,TokenStream获取分词详细信息在此回复牛妞的关于程序中分词器的问题,其实可以直接很简单的在词库中配置就好了,Lucene中分词的所有信息我们都可以从 ...
lucene构建同义词分词器
lucene4.0版本号以后已经用TokenStreamComponents 代替了TokenStream流.里面包含了filter和tokenizer 在较复杂的lucene搜索业务场景下,直接网 ...
【Java】Lucene检索引擎详解
基于Java的全文索引/检索引擎--Lucene Lucene不是一个完整的全文索引应用,而是一个用Java写的全文索引引擎工具包,它可以方便的嵌入到各种应用中实现针对应用的全文索引/检索功能. L ...
Lucene教程具体解释
注明:本文是由本人在开发有关基于lucene资源检索系统时的一点总结,当中一部分是自己依据开发过程自己总结的,也有部分是摘自网络,因无法获取当时摘文的地址,所以在此没有写源地址. 转载请声明出处 Lu ...
向Lucene增加中文分词功能
一.分词功能介绍分词模块对于搜索的重要性不言而喻.例如,没有分词时,搜索"和服"会出现"产品和服务",搜索"海尔"会出现"海尔德 ...
Lucene入门教程
Lucene教程 1 lucene简介 1.1 什么是lucene Lucene是一个全文搜索框架,而不是应用产品.因此它并不像www.baidu.com 或者google Desktop那么 ...
Lucene进阶操作，单字段、多字段和布尔搜索
上一篇文章是入门,现在是进阶,通过项目用到的例子给大家呈现较好的方法.本方法基于Lucene-4.7.2因为项目要用到JDK1.6. 这里使用几个很好用的类,是经过多次试验整理出来的,分别是单字段搜索 ...

Lucene 5 TokenStream

Lucene 5 TokenStream相关推荐

最新文章

热门文章