// Example entry point: issue a query by free text. Search segments Text into
// tokens itself, so the caller does not need to supply Tokens.
searcher.Search(types.SearchRequest{Text: "百度中国"})

// 查找满足搜索条件的文档,此函数线程安全
func (engine *Engine) Search(request types.SearchRequest) (output types.SearchResponse) {if !engine.initialized {log.Fatal("必须先初始化引擎")}var rankOptions types.RankOptionsif request.RankOptions == nil {rankOptions = *engine.initOptions.DefaultRankOptions} else {rankOptions = *request.RankOptions}if rankOptions.ScoringCriteria == nil {rankOptions.ScoringCriteria = engine.initOptions.DefaultRankOptions.ScoringCriteria}// 收集关键词tokens := []string{}if request.Text != "" {querySegments := engine.segmenter.Segment([]byte(request.Text))for _, s := range querySegments {token := s.Token().Text()if !engine.stopTokens.IsStopToken(token) {tokens = append(tokens, s.Token().Text())}}} else {for _, t := range request.Tokens {tokens = append(tokens, t)}}// 建立排序器返回的通信通道rankerReturnChannel := make(chan rankerReturnRequest, engine.initOptions.NumShards)// 生成查找请求lookupRequest := indexerLookupRequest{countDocsOnly:       request.CountDocsOnly,tokens:              tokens,labels:              request.Labels,docIds:              request.DocIds,options:             rankOptions,rankerReturnChannel: rankerReturnChannel,orderless:           request.Orderless,}// 向索引器发送查找请求for shard := 0; shard < engine.initOptions.NumShards; shard++ {engine.indexerLookupChannels[shard] <- lookupRequest}// 从通信通道读取排序器的输出numDocs := 0rankOutput := types.ScoredDocuments{}timeout := request.TimeoutisTimeout := falseif timeout <= 0 {// 不设置超时for shard := 0; shard < engine.initOptions.NumShards; shard++ {rankerOutput := <-rankerReturnChannelif !request.CountDocsOnly {for _, doc := range rankerOutput.docs {rankOutput = append(rankOutput, doc)}}numDocs += rankerOutput.numDocs}} else {// 设置超时deadline := time.Now().Add(time.Millisecond * time.Duration(request.Timeout))for shard := 0; shard < engine.initOptions.NumShards; shard++ {select {case rankerOutput := <-rankerReturnChannel:if !request.CountDocsOnly {for _, doc := range rankerOutput.docs {rankOutput = append(rankOutput, doc)}}numDocs += rankerOutput.numDocscase 
<-time.After(deadline.Sub(time.Now())):isTimeout = truebreak}}}// 再排序if !request.CountDocsOnly && !request.Orderless {if rankOptions.ReverseOrder {sort.Sort(sort.Reverse(rankOutput))} else {sort.Sort(rankOutput)}}// 准备输出output.Tokens = tokens// 仅当CountDocsOnly为false时才充填output.Docsif !request.CountDocsOnly {if request.Orderless {// 无序状态无需对Offset截断output.Docs = rankOutput} else {var start, end intif rankOptions.MaxOutputs == 0 {start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))end = len(rankOutput)} else {start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))end = utils.MinInt(start+rankOptions.MaxOutputs, len(rankOutput))}output.Docs = rankOutput[start:end]}}output.NumDocs = numDocsoutput.Timeout = isTimeoutreturn
}

索引器接收查找请求(indexerLookupWorker):

func (engine *Engine) indexerLookupWorker(shard int) {for {request := <-engine.indexerLookupChannels[shard] // 关键var docs []types.IndexedDocumentvar numDocs int        if request.docIds == nil {      docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, nil, request.countDocsOnly)} else {               docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, request.docIds, request.countDocsOnly)}if request.countDocsOnly {      request.rankerReturnChannel <- rankerReturnRequest{numDocs: numDocs}continue           }if len(docs) == 0 {    request.rankerReturnChannel <- rankerReturnRequest{}continue           }                      if request.orderless { var outputDocs []types.ScoredDocumentfor _, d := range docs {        outputDocs = append(outputDocs, types.ScoredDocument{DocId: d.DocId,                 TokenSnippetLocations: d.TokenSnippetLocations,TokenLocations:        d.TokenLocations})}request.rankerReturnChannel <- rankerReturnRequest{docs:    outputDocs,numDocs: len(outputDocs),}continue}rankerRequest := rankerRankRequest{countDocsOnly:       request.countDocsOnly,docs:                docs,options:             request.options,rankerReturnChannel: request.rankerReturnChannel,}engine.rankerRankChannels[shard] <- rankerRequest}
}

Lookup函数实现:

// 查找包含全部搜索键(AND操作)的文档
// 当docIds不为nil时仅从docIds指定的文档中查找
func (indexer *Indexer) Lookup(tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {if indexer.initialized == false {log.Fatal("索引器尚未初始化")}indexer.DocInfosShard.RLock()defer indexer.DocInfosShard.RUnlock()if indexer.DocInfosShard.NumDocuments == 0 {return}numDocs = 0// 合并关键词和标签为搜索键keywords := make([]string, len(tokens)+len(labels))copy(keywords, tokens)copy(keywords[len(tokens):], labels)indexer.InvertedIndexShard.RLock()table := make([]*types.KeywordIndices, len(keywords))for i, keyword := range keywords {indices, found := indexer.InvertedIndexShard.InvertedIndex[keyword]if !found {// 当反向索引表中无此搜索键时直接返回
            indexer.InvertedIndexShard.RUnlock()return} else {// 否则加入反向表中table[i] = indices}// 当没有找到时直接返回if len(table) == 0 {indexer.InvertedIndexShard.RUnlock()return}// 归并查找各个搜索键出现文档的交集// 从后向前查保证先输出DocId较大文档indexPointers := make([]int, len(table))for iTable := 0; iTable < len(table); iTable++ {indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1}// 平均文本关键词长度,用于计算BM25avgDocLength := indexer.InvertedIndexShard.TotalTokenLength / float32(indexer.DocInfosShard.NumDocuments)indexer.InvertedIndexShard.RUnlock()for ; indexPointers[0] >= 0; indexPointers[0]-- {// 以第一个搜索键出现的文档作为基准,并遍历其他搜索键搜索同一文档baseDocId := indexer.getDocId(table[0], indexPointers[0])// 全局范围查找目标文档是否存在if _, ok := indexer.DocInfosShard.DocInfos[baseDocId]; !ok {// if !IsDocExist(baseDocId) {// 文档信息中不存在反向索引文档时,跳过// 该情况由不对称删除操作所造成continue}if docIds != nil {_, found := docIds[baseDocId]if !found {continue}}iTable := 1found := truefor ; iTable < len(table); iTable++ {// 二分法比简单的顺序归并效率高,也有更高效率的算法,// 但顺序归并也许是更好的选择,考虑到将来需要用链表重新实现// 以避免反向表添加新文档时的写锁。// TODO: 进一步研究不同求交集算法的速度和可扩展性。position, foundBaseDocId := indexer.searchIndex(table[iTable],0, indexPointers[iTable], baseDocId)if foundBaseDocId {indexPointers[iTable] = position} else {if position == 0 {// 该搜索键中所有的文档ID都比baseDocId大,因此已经没有// 继续查找的必要。return} else {// 继续下一indexPointers[0]的查找indexPointers[iTable] = position - 1found = falsebreak}}}if found {indexedDoc := types.IndexedDocument{}// 当为LocationsIndex时计算关键词紧邻距离if indexer.initOptions.IndexType == types.LocationsIndex {// 计算有多少关键词是带有距离信息的numTokensWithLocations := 0for i, t := range table[:len(tokens)] {if len(t.Locations[indexPointers[i]]) > 0 {numTokensWithLocations++}}if numTokensWithLocations != len(tokens) {if !countDocsOnly {docs = append(docs, types.IndexedDocument{DocId: baseDocId,})}numDocs++break}// 计算搜索键在文档中的紧邻距离tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)indexedDoc.TokenProximity = int32(tokenProximity)indexedDoc.TokenSnippetLocations = 
tokenLocations// 添加TokenLocationsindexedDoc.TokenLocations = make([][]int, len(tokens))for i, t := range table[:len(tokens)] {indexedDoc.TokenLocations[i] = t.Locations[indexPointers[i]]}}// 当为LocationsIndex或者FrequenciesIndex时计算BM25if indexer.initOptions.IndexType == types.LocationsIndex ||indexer.initOptions.IndexType == types.FrequenciesIndex {bm25 := float32(0)d := indexer.DocInfosShard.DocInfos[baseDocId].TokenLengthsfor i, t := range table[:len(tokens)] {var frequency float32if indexer.initOptions.IndexType == types.LocationsIndex {frequency = float32(len(t.Locations[indexPointers[i]]))} else {frequency = t.Frequencies[indexPointers[i]]}// 计算BM25if len(t.DocIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {// 带平滑的idfidf := float32(math.Log2(float64(indexer.DocInfosShard.NumDocuments)/float64(len(t.DocIds)) + 1))k1 := indexer.initOptions.BM25Parameters.K1b := indexer.initOptions.BM25Parameters.Bbm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))}}indexedDoc.BM25 = float32(bm25)}indexedDoc.DocId = baseDocIdif !countDocsOnly {docs = append(docs, indexedDoc)}numDocs++}}return
}

转载于:https://www.cnblogs.com/bonelee/p/6582673.html

wukong引擎源码分析之搜索——docid有序的数组里二分归并求交集,如果用跳表的话,在插入索引时会更快...

相关推荐

  1. wukong引擎源码分析之索引——part 1 倒排列表本质是有序数组存储

    searcher.IndexDocument(0, types.DocumentIndexData{Content: "此次百度收购将成中国互联网最大并购"}) engine.go ...

  2. wukong引擎源码分析之索引——part 3 文档评分 无非就是将docid对应的fields信息存储起来,为搜索结果rank评分用...

    之前的文章分析过,接受索引请求处理的代码在segmenter_worker.go里: func (engine *Engine) segmenterWorker() {for {request := ...

  3. wukong引擎源码分析之索引——part 2 持久化 直接set(key,docID数组)在kv存储里...

    前面说过,接收indexerRequest的代码在index_worker.go里: func (engine *Engine) indexerAddDocumentWorker(shard int) ...

  4. 虚幻引擎源码分析(5)

    虚幻引擎源码分析(5)

  5. Elasticsearch源码分析—线程池(十一) ——就是从队列里处理请求

    Elasticsearch源码分析-线程池(十一) 转自:https://www.felayman.com/articles/2017/11/10/1510291570687.html 线程池 每个节 ...

  6. 悟空分词的搜索和排序源码分析之——搜索

    转自:http://blog.codeg.cn/2016/02/02/wukong-source-code-reading/ 搜索过程分析 下面我们来分析一下搜索的过程.首先构造一个SearchReq ...

  7. 白鹭php源码,egret 2D引擎源码分析(二) 创建播放器

    本帖最后由 fightingcat 于 2016-7-16 00:26 编辑 上一篇讲到了引擎的入口runEgret为每一个播放器标签(就是index.html中看到的那个 之前web.WebPlay ...

  8. 悟空分词与mysql结合_悟空分词的搜索和排序源码分析之——搜索

    转自:http://blog.codeg.cn/2016/02/02/wukong-source-code-reading/ 搜索过程分析 下面我们来分析一下搜索的过程.首先构造一个SearchReq ...

  9. 以太坊共识引擎源码分析

    这一篇分析以太坊的共识引擎,先看一下各组件之间的关系: Engine接口定义了共识引擎需要实现的所有函数,实际上按功能可以划分为2类: 区块验证类:以Verify开头,当收到新区块时,需要先验证区块的 ...

最新文章

  1. Jupyter官方神器:可视化 Debug 工具!
  2. java 调试 gdb_android gdb 调试实例演示(有源代码篇)
  3. SpringMVC中@RequestParam(username)
  4. Instagram 在 PyCon 2017 的演讲摘要
  5. java接口有非抽象方法_如果一个类没有实现Java接口的所有抽象方法,会发生什么?...
  6. WPF 密码框水印与明文切换
  7. leetcode:Majority Number
  8. 敏捷开发一千零一问系列之十二:敏捷实施的步骤?
  9. Beautiful选择器/遍历文档树Day3-7
  10. 初识Loadrunner
  11. 二维码中间嵌入logo
  12. 什么是MXF文件?将MXF转为MP4格式的方法
  13. qtdesigner设计表格_Qt Designer下的一些基础操作
  14. 2021-07-11 layer与tier的区别(英语)
  15. 函数式编程对象Either
  16. android四大组件
  17. 在word中对学位论文进行页码和页眉设置的方法
  18. java(tm)6 update 45_Java(TM) 6 Update
  19. try {}里有一个return语句,那么紧跟在这个try后的finally {}里的code会不会被执行,什么时候被执行,在return前还是后
  20. 定义复数类Complex,重载运算符“+”,使之用于复数的加法运算

热门文章

  1. 汇编 int 10h
  2. 快应用 - 应用签名校验失败
  3. c mysql存储过程实例_MySQL存储过程实例
  4. python获取文本光标_python 文件的操作以及调整光标
  5. 保存图像_设计干货知识:SVG vs PNG vs JPG|图像格式的优缺点
  6. plsql objects 过一段时间就会未连接oracle_记一次生产数据库故障排查--连接管理等待事件...
  7. python函数名的语法_Python 基础语法六 ——函数
  8. 怎么查看linux是不是as7u4,Linux下搭建Android开发环境
  9. 【机器学习】Apriori 算法进行关联分析和FP-growth算法
  10. python【Matlibplot绘图库】-主要概念