倒排索引的理论和代码实现

一、倒排索引概念

倒排索引常使用在搜索引擎当中，是搜索引擎为文档内容建立索引，实现内容快速检索必不可少的数据结构
倒排索引是由单词的集合“词典”和倒排列表的集合“倒排文件”组成的
倒排索引的存储：内存索引和B+树索引
理解正排索引结构和倒排索引结构；掌握词典、倒排项，倒排列表的具体实现

我们打开搜索网站并输入关键字以后，浏览器作为客户端把关键字发送到对应的搜索引擎服务器（server端），服务器分析这个关键字，在全网几千万乃至上亿个HTML网页中快速找到我们想要的内容，然后把相关的页面发送给客户端

既然要搜索海量的网页，如果在数据库花费磁盘I/O去搜索，那应该花费很长时间吧？

其实不是的，服务器可以在毫秒级的时间内把页面返回给我们。这是因为搜索引擎（比如Elasticsearch）的服务器会提前对全网页面的内容进行分词，提取出其中的关键字，并对这些关键字建立倒排索引；搜索时利用倒排索引，就能快速找到所有出现过该关键字的HTML页面

比如我们有两个文档（好比是两个HTML页面）

正排索引结构如下：行表示文档，列表示关键词

我们遍历的时候都是按行遍历，如果使用正排索引，就需要遍历完所有的文档，才能知道关键词engine在哪些文档中出现，效率太低

由于我们都是用关键字搜索，那我们把关键字作为行，文档作为列，这就是倒排索引：

这样我们就能在 $O(1)$ 的时间内（例如词典用哈希表实现时）知道，关键词engine在哪些文档中出现

搜索引擎会把各个文档排一个优先级，按照优先级给用户展示，优先级会和很多因素相关：比如竞价、关键词匹配程度、关键词出现频率、网站权威性等等

二、倒排索引结构

词典：倒排索引结构的左边那一部分，包含了从所有文档中抽取出来的关键字

倒排项：关键词所在文档的描述信息，也就是图中的P1、P2，包括文档的标识、关键词出现频率、关键词出现的位置

倒排列表：关键词可能会出现在很多文档中，这些关键词对应的倒排项集合就是倒排列表

倒排文件：由很多倒排列表组成，用于数据持久化

倒排文件存放在磁盘上，用于数据持久化。使用时我们把磁盘上的数据读出来，组织成B+树，这样磁盘I/O次数少，搜索速度快。实际使用倒排索引的时候，直接把倒排索引存到数据库就可以了：关系型数据库天然实现了磁盘的读取和B+树的结构，我们就不用自己实现了

使用倒排索引搜索的方法：

搜索单个单词：直接在词典里找到这个单词，然后就可以找到所有的倒排项，就能知道这个单词出现在哪些文档里，出现的频率以及位置等信息
搜索一个句子：比如我们搜索“search google”，搜索引擎会按照一定的分词算法进行分词，根据分词的结果到词典中进行匹配，然后获取关键词的倒排项，最终需要给所有的倒排项打分并展示给用户。比如可以先求倒排列表交集，然后判断单词在文档中的位置来计算匹配程度，最终得到展示给用户的优先级

搜索注意事项：无论用户搜索单数还是复数，大写还是小写，我们都应该给出相应的结果。用户用小写搜索时，包含大写形式的结果也要展示给用户；用户用单数搜索时，包含复数形式的结果也要展示给用户。此外，近义词等情况也需要考虑并展示给用户

#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <list>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#undef UNICODE
#include <Windows.h>

using namespace std;

// Inverted term (posting)
// One posting of the inverted index: describes the occurrences of a single
// word inside a single document.
struct InvertTerm {
    // Creates a posting for document `docid` with an initial frequency `freqs`
    // and `location` as the first recorded position.
    InvertTerm(std::string docid, int freqs, int location)
        : docid_(std::move(docid)), freqs_(freqs) {
        locations_.emplace_back(location);
    }

    // Postings are compared by document only: when the posting lists of
    // different words are intersected, two postings that refer to the same
    // document count as equal regardless of frequency or positions.
    bool operator==(const InvertTerm& term) const {
        return docid_ == term.docid_;
    }

    // Orders postings by document id (needed for sort / set_intersection).
    bool operator<(const InvertTerm& term) const {
        return docid_ < term.docid_;
    }

    std::string docid_;           // document the word appears in
    int freqs_;                   // number of occurrences in that document
    std::vector<int> locations_;  // positions of the occurrences in that document
};

// Inverted list (posting list)
class InvertList {public:// 添加倒排项void add_term(string docid, int location) {for (auto& term : term_list_) {if (term.docid_ == docid) {// 倒排项（一个文件）已经在倒排列表中存在，一个倒排项表示一个文件的详细信息term.freqs_++;term.locations_.emplace_back(location);return;}}// 第一次创建docid文档的倒排项term_list_.emplace_back(InvertTerm(docid, 1, location));}// 获取倒排列表的内容const vector<InvertTerm>& get_invert_list() const {return term_list_;}
private:vector<InvertTerm> term_list_;
};// 倒排索引
class InvertIndex {public:// 设置文档搜索根路径void set_search_path(string path) {cout << "搜索文件..." << endl;get_all_file(path.c_str());cout << "完成！" << endl;cout << "开始创建倒排索引";create_invert_index();cout << "完成！" << endl;}// 设置过滤后缀void add_suffix(string suffix) {suffixs_.push_back(suffix);}// 查询接口void query(string phrase) {// 先分词vector<string> word_list;// strtok将分割字符替换成\0string input_str = phrase;char* word = strtok(const_cast<char*>(phrase.c_str()), " ");while (word != nullptr) {// 过滤word前后的空白字符word = trim(word);if (strlen(word) > 0) {word_list.emplace_back(word);}word = strtok(nullptr, " ");}// phrase全是空白字符，分割完后没有结果if (word_list.empty()) {return;}// 分割完后只有一个词if (word_list.size() == 1) {auto iter = invert_map_.find(word_list[0]);if (iter == invert_map_.end()) {cout << "未找到任何匹配的内容！" << endl;return;}for (auto& term : iter->second.get_invert_list()) {// 打印倒排列表中所有的倒排项cout << term.docid_ << " freqs:" << term.freqs_ << endl;cout << input_str << "出现的位置：";for (int location : term.locations_) {cout << location << " ";}cout << endl;}}else{// 分割完后有多个词// invert_lists存储每个词对应的倒排列表vector<InvertList> invert_lists;for (int i = 0; i < word_list.size(); i++) {auto iter = invert_map_.find(word_list[i]);if (iter != invert_map_.end()) {invert_lists.emplace_back(iter->second);}}//invert_lists每个元素就是一个倒排列表，求这些列表的交集vector<InvertTerm> common_terms;vector<InvertTerm> v1(invert_lists[0].get_invert_list().begin(), invert_lists[0].get_invert_list().end());sort(v1.begin(), v1.end());for (int i = 1; i < invert_lists.size(); i++) {vector<InvertTerm> v2(invert_lists[i].get_invert_list().begin(), invert_lists[i].get_invert_list().end());sort(v2.begin(), v2.end());// set_intersection求交集的时候要求集合有序，存放的结果序列是从第一个序列中复制的set_intersection(v1.begin(), v1.end(), v2.begin(), v2.end(), back_inserter(common_terms));v1.swap(common_terms);common_terms.clear();}// 此时v1就存放了用户输入多个词都出现的文档（倒排项）// 其实这里还应该判断input_str的每个词在倒排项中是否是连续的for (auto& term : v1) {// 打印倒排列表中所有的倒排项cout << term.docid_ << " freqs:" << term.freqs_ << endl;cout << 
input_str << "出现的位置：";for (int location : term.locations_) {cout << location << " ";}cout << endl;}}}private:// 给file_list_中的文件创建倒排索引void create_invert_index() {for (string file_path : file_list_) {cout << ".";FILE* fp = fopen(file_path.c_str(), "r");if (nullptr == fp) {cerr << file_path << "打开失败！" << endl;continue;}// 按行读取文件的内容，并按照空格分词int location = 0;const int LINE_SIZE = 2048;char line_content[LINE_SIZE] = { 0 };while (!feof(fp)) {vector<string> line_word_list;          // 存放某个文件中一行的单词// 读一行文件内容fgets(line_content, LINE_SIZE, fp);// 按照空格分词char* word = strtok(line_content, " ");while (word != nullptr) {// 过滤word前后的空白字符word = trim(word);if (strlen(word) > 0) {line_word_list.emplace_back(word);}word = strtok(nullptr, " ");  // line_content已经被记录了，不需要再传}// 一行数据分割完成// 开始给word_list里面的单词创建或修改倒排列表for (string w : line_word_list) {location++;auto iter = invert_map_.find(w);if (iter == invert_map_.end()) {// w没有出现在词典invert_map_，词典需要加入新的单词，创建倒排列表InvertList list;list.add_term(file_path, location);invert_map_.emplace(w, list);}else {// w 存在于词典invert_map_中，已经有了倒排列表，需要添加倒排项iter->second.add_term(file_path, location);}}}fclose(fp);}}// 去掉分词后的单词，前后多余的空白字符char* trim(char* word) {int i = 0;int j = i;while (word[j] != '\0') {j++;}// j指向\0j--;while (i <= j) {bool is_modify_i = false;bool is_modify_j = false;if (word[i] == ' ' || word[i] == '\t' || word[i] == '\n') {i++;is_modify_i = true;}if (word[j] == ' ' || word[j] == '\t' || word[j] == '\n') {j--;is_modify_j = true;}if (!is_modify_i && !is_modify_j) {break;}}word[j + 1] = '\0';return word + i;}// 递归找到path目录下所有指定后缀的文件int get_all_file(const char* path) {char szFind[MAX_PATH];WIN32_FIND_DATA FindFileData;strcpy(szFind, path);strcat(szFind, "\\*.*");HANDLE hFind = FindFirstFile(szFind, &FindFileData);if (INVALID_HANDLE_VALUE == hFind) {return -1;}do {if (FindFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {if (strcmp(FindFileData.cFileName, ".") != 0 && strcmp(FindFileData.cFileName, "..") != 0) {// 发现子目录，开始递归char szFile[MAX_PATH] 
= { 0 };strcpy(szFile, path);strcat(szFile, "\\");strcat(szFile, FindFileData.cFileName);get_all_file(szFile);}}else {// 找到文件，处理string file_name(FindFileData.cFileName);for (string suffix : suffixs_) {int pos = file_name.find(suffix);if (pos != string::npos && pos + suffix.size() == file_name.size()) {// 找到指定后缀string file_path(path);file_path.append("\\");file_path.append(file_name);file_list_.emplace_back(file_path);break;}}}} while (FindNextFile(hFind, &FindFileData));FindClose(hFind);}
private:vector<string> suffixs_;                        // 需要过滤的后缀vector<string> file_list_;                        // 存储所有需要建立倒排的文件路径unordered_map<string, InvertList> invert_map_;  // 存储词典和倒排列表
};int main() {InvertIndex index;index.add_suffix(".py");index.set_search_path("C:\\files");while (true) {char buff[128] = { 0 };cout << "搜索内容：";cin.getline(buff, 128);index.query(buff);}return 0;
}