学习hashtable，处理“海量”数据

直接上代码吧。哈希表的逻辑其实很简单，这里的目的是对比几种去重方法的速度；请对照代码，再看后面的输出结果：

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <sys/timeb.h>
  4 #include <fstream>
  5 #include <string>
  6 #include <vector>
  7 #include <list>
  8 #include <algorithm>
  9 #include <set>
 10 #include <hash_set>
 11 #include <hash_map>
 12
 13 using namespace std;
 14
 15 static const int hashtable_length    = 49157;
 16
 17 // 用于定位一个Bucket
 18 unsigned int hash_function(const char* str)
 19 {
 20     const char* end_of_str = str+strlen(str);
 21     unsigned int sum = 0;
 22     while (end_of_str - str > 3)
 23     {
 24         sum = (sum + (unsigned int)*((unsigned int*)str))%hashtable_length;
 25         str += 4;
 26     }
 27     char tmp[4] = {0};
 28     strcpy(tmp, str);
 29     sum = (sum + (unsigned int)*((unsigned int*)tmp))%hashtable_length;
 30     memset(tmp, 0, 4);
 31
 32     return sum;
 33 }
 34
 35 // 用于在一个Buchet中查找目标
 36 bool find_in_bucket(list<string>& l, const char* str)
 37 {
 38     list<string>::iterator iter;
 39     unsigned int hash_key = hash_function(str);
 40     bool exist = false;
 41     for (iter = l.begin(); iter != l.end(); iter++)
 42         if (strcmp(str, iter->c_str()) == 0)
 43                 return true;
 44     return false;
 45 }
 46
 47 // 用于把目标放到Bucket中
 48 int insert_in_bucket(list<string>& l, const char* str)
 49 {
 50     if (!find_in_bucket(l, str))
 51     {
 52         l.push_back(string(str));
 53         return l.size();
 54     }else
 55         return -1;
 56 }
 57
 58 // 用于在整个hash表中查找目标
 59 bool find_in_hashtable(vector<list<string>>& v, const char* str)
 60 {
 61     return find_in_bucket(v[hash_function(str)], str);
 62 }
 63
 64 // 用于在整个hash表中插入一个元素
 65 int insert_in_hashtable(vector<list<string>>& v, const char* str)
 66 {
 67     return insert_in_bucket(v[hash_function(str)], str);
 68 }
 69
 70 // 过滤掉文本中的标点符号
 71 void filter(char* str)
 72 {
 73     while(*str++)
 74         if(*str == ',' || *str == '.'
 75             || *str == '?' || *str == '-'
 76             || *str == '\"' || *str == '\''
 77             || *str == ')' || *str == '('
 78             || *str == '!')
 79             *str = ' ';
 80 }
 81
 82 // 读取一行中的一个单词
 83 char* get_word_from_buff(char* &buff, char* word)
 84 {
 85     while (*buff && *buff == ' ')
 86         buff++;
 87     if (!*buff)
 88         return NULL;
 89     int cnt = 0;
 90     while (*buff && *buff != ' ')
 91         word[cnt++] = *buff++;
 92     word[cnt] = 0;
 93     return buff;
 94 }
 95
 96 int main()
 97 {
 98     // 对比哈希表和朴素方法的差别
 99     // 任务是存储一个文件中的英文单词，要求不能重复
100
101     timeb time_begin;
102     timeb time_end;
103     ifstream input_file;
104     input_file.open("D:\\input.txt");
105     char buff[10241] = {0};    // 10KB的缓冲区
106     char word[100];
107     vector<string> vector_of_words;
108     ftime(&time_begin);
109     // 下面代码速度奇慢无比
110     while (input_file.getline(buff, 10240))
111     {
112         filter(buff);
113         char* ptr_to_buff = buff;
114         vector<string>::iterator iter = vector_of_words.begin();
115         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
116         {
117             int i = 0;
118             for (; i < vector_of_words.size(); i++)
119                 if(strcmp(vector_of_words[i].c_str(), word) == 0)
120                     break;
121             if (i >= vector_of_words.size())
122                 vector_of_words.push_back(string(word));
123         }
124     }
125
126     ftime(&time_end);
127     unsigned int seconds = time_end.time - time_begin.time;
128     unsigned int miseconds = time_end.millitm - time_begin.millitm;
129     miseconds = seconds * 1000 + miseconds;
130     printf("朴素的方法：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, vector_of_words.size());
131
132     input_file.close();
133     input_file.open("D:\\input.txt");
134     vector<list<string>> hashtable_of_words(hashtable_length, list<string>());
135     ftime(&time_begin);
136     int count = 0;
137     while (input_file.getline(buff, 10240))
138     {
139         filter(buff);
140         char* ptr_to_buff = buff;
141         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
142             if(insert_in_hashtable(hashtable_of_words, word) != -1)
143                 ++count;
144     }
145     ftime(&time_end);
146     seconds = time_end.time - time_begin.time;
147     miseconds = time_end.millitm - time_begin.millitm;
148     miseconds = seconds * 1000 + miseconds;
149     printf("hashtable：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, count);
150
151     input_file.close();
152     input_file.open("D:\\input.txt");
153     set<string> set_of_words;
154     ftime(&time_begin);
155     while (input_file.getline(buff, 10240))
156     {
157         filter(buff);
158         char* ptr_to_buff = buff;
159         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
160             set_of_words.insert(string(word));
161     }
162     ftime(&time_end);
163     seconds = time_end.time - time_begin.time;
164     miseconds = time_end.millitm - time_begin.millitm;
165     miseconds = seconds * 1000 + miseconds;
166     printf("rbtree-set：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, set_of_words.size());
167
168     input_file.close();
169     input_file.open("D:\\input.txt");
170     hash_map<string, int> hashmap_of_words;
171     ftime(&time_begin);
172     while (input_file.getline(buff, 10240))
173     {
174         filter(buff);
175         char* ptr_to_buff = buff;
176         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
177             hashmap_of_words[string(word)]++;
178     }
179     ftime(&time_end);
180     seconds = time_end.time - time_begin.time;
181     miseconds = time_end.millitm - time_begin.millitm;
182     miseconds = seconds * 1000 + miseconds;
183     printf("hash_map：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, hashmap_of_words.size());
184
185     input_file.close();
186     input_file.open("D:\\input.txt");
187     hash_set<string> hashset_of_words;
188     ftime(&time_begin);
189 #if 0    // 下面代码速度奇慢无比，所以注释掉了实际没有执行，我等了半天，没有计算完，不知道是不是逻辑有问题～
190     while (input_file.getline(buff, 10240))
191     {
192         filter(buff);
193         char* ptr_to_buff = buff;
194         while (ptr_to_buff = get_word_from_buff(ptr_to_buff, word))
195             hashset_of_words.insert(string(word));
196     }
197 #endif
198     ftime(&time_end);
199     seconds = time_end.time - time_begin.time;
200     miseconds = time_end.millitm - time_begin.millitm;
201     miseconds = seconds * 1000 + miseconds;
202     printf("hash_set：\t处理时间为：\t%u\t毫秒, 统计了%d个单词\n", miseconds, hashset_of_words.size());
203
204     system("pause");
205     return 0;
206 }

输出结果：

朴素的方法：     处理时间为：    238594  毫秒, 统计了28661个单词
hashtable：     处理时间为：    2312    毫秒, 统计了28661个单词
rbtree-set：    处理时间为：    13438   毫秒, 统计了28661个单词
hash_map：      处理时间为：    6953    毫秒, 统计了28661个单词
hash_set：      处理时间为：    0       毫秒, 统计了0个单词
请按任意键继续. . .

后来又做了小幅的改动：

 1 unsigned int hash_function_opt(const char* str)
 2 {
 3     const char* end_of_str = str+strlen(str);
 4     unsigned int sum = 0;
 5     while (end_of_str - str > 3)
 6     {
 7         sum ^= *((unsigned int*)str);
 8         str += 4;
 9     }
10     char tmp[4] = {0};
11     strcpy(tmp, str);
12     sum ^= (unsigned int)*((unsigned int*)tmp);
13     sum %= hashtable_length;
14     memset(tmp, 0, 4);
15
16     return sum;
17 }

// Reports whether one bucket (a list of strings) holds a string equal to
// `str`, comparing with strcmp.
//
// l:   the bucket to search; it is not modified.
// str: NUL-terminated string to look for.
// Returns true on the first match, false when the bucket has no match.
bool find_in_bucket_opt(std::list<std::string>& l, const char* str)
{
    bool found = false;
    // The scan stops as soon as a match has been seen.
    for (std::list<std::string>::iterator entry = l.begin(); !found && entry != l.end(); ++entry)
        found = (strcmp(str, entry->c_str()) == 0);
    return found;
}

// Replaces every character that is not an ASCII letter (a-z, A-Z) with a
// space, in place, so the line can be split into words on spaces alone.
//
// str: NUL-terminated, writable text line.
void filter(char* str)
{
    for (char* cursor = str; *cursor != '\0'; ++cursor)
    {
        const char c = *cursor;
        const bool is_lower = (c >= 'a' && c <= 'z');
        const bool is_upper = (c >= 'A' && c <= 'Z');
        if (!is_lower && !is_upper)
            *cursor = ' ';
    }
}

 1     ofstream output_file;
 2     output_file.open("D:\\output.txt");
 3     vector<string> all_words;
 4     for(vector<list<string>>::iterator i_v = hashtable_of_words.begin(); i_v != hashtable_of_words.end(); i_v++)
 5         for(list<string>::iterator i_l = i_v->begin(); i_l != i_v->end(); i_l++)
 6             all_words.push_back(*i_l);
 7     sort(all_words.begin(), all_words.end());
 8     for(vector<string>::iterator i_v = all_words.begin(); i_v != all_words.end(); i_v++)
 9         output_file << *i_v <<endl;
10     output_file.close();

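改动之后发现，之前的版本其实是有些小错误的：旧的 filter 用 while(*str++) 先自增再判断，每行的第一个字符从未被检查，而且只替换了少数几种标点；新的 filter 把所有非英文字母的字符都替换成空格，统计出的单词数应该也因此发生了变化（28661 → 27735）。新的输出结果为：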

朴素的方法：    处理时间为：    0       毫秒, 统计了0个单词
hashtable：     处理时间为：    2079    毫秒, 统计了27735个单词
hashtable2：    处理时间为：    2047    毫秒, 统计了27735个单词
rbtree-set：    处理时间为：    0       毫秒, 统计了0个单词
hash_map：      处理时间为：    0       毫秒, 统计了0个单词
hash_set：      处理时间为：    0       毫秒, 统计了0个单词
请按任意键继续. . .

转载于:https://www.cnblogs.com/zanzan101/p/3334136.html

学习hashtable，处理“海量”数据相关推荐

【深度学习】基于 Alluxio 数据缓存的性能优化
作者 | 车漾(阿里云高级技术专家).顾荣(南京大学副研究员) 导读:Alluxio 项目诞生于 UC Berkeley AMP 实验室,自开源以来经过 7 年的不断开发迭代,支撑大数据处理场景的数 ...
《大数据》2015年第3期“网络大数据专题”——基于特征学习的文本大数据内容理解及其发展趋势...
基于特征学习的文本大数据内容理解及其发展趋势袁书寒,向阳,鄂世嘉 (同济大学计算机科学与技术系上海 201804) 摘要:大数据中蕴含着重要的价值信息,文本大数据作为大数据的重要组成部分,是人类 ...
如何判断你的数据集是否适合使用深度学习模型？如果数据量太小有什么解决办法？
如何判断你的数据集是否适合使用深度学习模型?如果数据量太小有什么解决办法? deep learning is a data hungry problem 数据集太小,数据样本不足时,深度学习相对其它机 ...
深度学习遇上稀缺数据就无计可施？这里有几个好办法
2019-12-07 05:30:39 作者 | Tyler Folkman 编译 | 杨晓凡对于深度学习而言,在有很多数据的情况下,再复杂的问题也不在话下,然而没有这么多数据呢?本文作者 Tyle ...
python学习音频-Python 音频数据扩充的技巧
经典的深度学习网络AlexNet使用数据扩充(Data Augmentation)的方式扩大数据集,取得较好的分类效果.在深度学习的图像领域中,通过平移. 翻转.加噪等方法进行数据扩充.但是,在音频( ...
Caffe学习系列(13)：数据可视化环境（python接口)配置
原文有更新: Caffe学习系列(13):数据可视化环境(python接口)配置 - denny402 - 博客园 http://www.cnblogs.com/denny402/p/5088399. ...
oracle数据库开多线程,学习笔记:Oracle表数据导入 DBA常用单线程插入多线程插入 sql loader三种表数据导入案例...
天萃荷净 oracle之数据导入,汇总开发DBA在向表中导入大量数据的案例,如:单线程向数据库中插入数据,多线程向数据表中插入数据,使用sql loader数据表中导入数据案例 1.Oracle数据库 ...
Vue学习笔记入门篇——数据及DOM
本文为转载,原文:Vue学习笔记入门篇--数据及DOM 数据 data 类型 Object | Function 详细 Vue 实例的数据对象.Vue 将会递归将 data 的属性转换为 getter ...
结合深度学习的工业大数据应用研究
结合深度学习的工业大数据应用研究李广杨欣电子科技大学大数据研究中心,四川成都 611731 成都数之联科技有限公司,四川成都 610041 摘要:如何将大数据等核心技术与智能制造结合, ...
vs2010 学习Silverlight学习笔记(11)：数据与通信之WebClient
概要: 基础知识终于学完了,我今天又从第一篇看到第十篇,发现明白了一些东西,还有忘记了部分东西.呵呵,咱不能猴子掰玉米,学了新的忘记旧的.要经常去复习,去用.这一篇是数据通信部分的第一篇,有些东西没接 ...

学习hashtable，处理“海量”数据

学习hashtable，处理“海量”数据相关推荐

最新文章

热门文章