Bloom filters in Python

Bloom filters in Python - 张沈鹏,在路上... - ITeye技术网站

Bloom filters in Python

博客分类:Python

Python, C, C++, C#, 算法

Python语言: 临时自用代码@代码发芽网

#coding:utf-8

# Bloom filters in Python

# Adam Langley <agl@imperialviolet.org>

# 给CountedBloom加了一个max_count 张沈鹏 <zsp007@gmail.com>

# Bloom-Filter算法简介

# http://www.googlechinablog.com/2007/07/bloom-filter.html

# http://zh.wikipedia.org/wiki/%E5%B8%83%E9%9A%86%E8%BF%87%E6%BB%A4%E5%99%A8

# 这个计算器可以帮你求最佳的参数

# http://www.cc.gatech.edu/~manolios/bloom-filters/calculator.html

# CountedBloom 的 buckets 参数对应于计算器的m,也就是"m denotes the number of bits in the Bloom filter"

import array
import struct
import zlib

# Shared 256-entry substitution table read by the hashing function.
# It starts zeroed and is filled in by the mixarray_init() call below.
# (An int-list initialiser is used because Python 3 rejects a str here.)
mixarray = array.array('B', [0] * 256)


# The mixarray is based on RC4 and is used as diffusion in the hashing function
def mixarray_init(mixarray):
    '''Fill @mixarray in place with a fixed permutation of 0..255.

    @mixarray is any mutable sequence with at least 256 slots that can hold
    the values 0..255. It is first set to the identity mapping and then
    shuffled by four passes of swaps. The swap index starts at 7 and is
    advanced by the value found at each position, so no random input is
    involved and every call yields the same table.

    Nothing is returned; the sequence is modified directly.'''
    for i in range(256):
        mixarray[i] = i

    k = 7
    for j in range(4):
        for i in range(256):
            s = mixarray[i]
            k = (k + s) % 256
            # swap entries i and k
            mixarray[i], mixarray[k] = mixarray[k], s


mixarray_init(mixarray)

class Bloom(object):
    '''Bloom filters provide a fast and compact way of checking set membership. They do this by introducing a risk of a
    false positive (but there are no false negatives).

    For more information see http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html'''

    # bitmask[i] selects bit i of a byte, numbering from the most significant bit
    bitmask = [0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01]

    def __init__(self, bytes, hashes, data=None):
        '''@bytes is the size of the bloom filter in 8-bit bytes and @hashes is the number of hash functions to use. Consult the
        web page linked above for values to use. If in doubt, bytes = num_elements and hashes = 4

        @data, when given, is an existing mutable byte buffer of exactly @bytes entries. It is used as the filter's
        storage directly (not copied). When omitted a zeroed buffer is allocated.'''
        self.hashes = hashes
        self.bytes = bytes
        if data is None:
            self.a = self._make_array(bytes)
        else:
            assert len(data) == bytes
            self.a = data

    def init_from_counted(self, cnt):
        '''Set the contents of this filter from the contents of the counted filter @cnt. You have to match sizes

        Every non-zero 4-bit bucket of @cnt sets the corresponding bit here; bits that are already set are left set.
        Raises ValueError when the number of bits in this filter differs from the number of buckets in @cnt.'''
        if self.bytes * 8 != len(cnt.a) * 2:
            raise ValueError('Filters are not the same size')

        for i, packed in enumerate(cnt.a):
            # one counted byte packs two buckets: bucket 2*i in the high nibble, bucket 2*i + 1 in the low nibble
            for bit, count in ((2 * i, packed >> 4), (2 * i + 1, packed & 0x0f)):
                if count:
                    self.a[bit // 8] |= self.bitmask[bit % 8]

    def _make_array(self, size):
        '''Return a new unsigned-byte array holding @size zero bytes.'''
        # repeating a one-element array avoids building a throwaway string or list of the full size
        return array.array('B', [0]) * size

    def _hashfunc(self, n, val):
        '''Apply the nth hash function

        @val is normally an integer that fits an unsigned 32-bit struct field; anything outside that range raises
        struct.error. Text and byte strings are also accepted: they are first reduced to 32 bits with CRC-32
        (text is encoded as UTF-8), then hashed like an integer.

        Returns an unsigned 32-bit integer. The value is packed and unpacked with the machine's native byte order.'''
        if isinstance(val, type(u'')):
            val = val.encode('utf-8')
        if isinstance(val, (bytes, bytearray)):
            # the mask keeps the CRC non-negative on interpreters that return a signed result
            val = zlib.crc32(val) & 0xffffffff

        packed = bytearray(struct.pack('I', val))
        # substitute each byte through the shared table, offset by the hash function number
        mixed = bytearray(mixarray[(octet + n) % 256] for octet in packed)
        return struct.unpack('I', bytes(mixed))[0]

    def _bit_indexes(self, val):
        '''Yield the bit position selected for @val by each of the hash functions.'''
        nbits = self.bytes * 8
        for i in range(self.hashes):
            yield self._hashfunc(i, val) % nbits

    def insert(self, val):
        '''Add @val to the filter by setting every bit its hash functions select.'''
        for n in self._bit_indexes(val):
            self.a[n // 8] |= self.bitmask[n % 8]

    def __contains__(self, val):
        '''Return True when every bit selected for @val is set (@val may be present), False when @val is definitely absent.'''
        return all(self.a[n // 8] & self.bitmask[n % 8] for n in self._bit_indexes(val))

# Largest value a 4-bit bucket can hold. A bucket that reaches it is treated as saturated: it is never incremented
# or decremented again.
MAX_COUNT = 15


class CountedBloom(Bloom):
    '''Just like a Bloom filter, but provides counting (e.g. you can delete as well). This uses 4 bits per bucket, so is
    generally four times larger than the same non-counted bloom filter.'''

    def __init__(self, buckets, hashes):
        '''Please note that @buckets must be even. Also note that with a Bloom object you give the number of *bytes* and each byte is 8 buckets. Here you're giving the number of buckets.'''
        assert buckets % 2 == 0
        self.hashes = hashes
        self.buckets = buckets
        # two 4-bit buckets are packed into every byte
        self.a = self._make_array(buckets // 2)

    def _slots(self, val):
        '''Yield a (byte index, shift) pair for each bucket that @val hashes to.'''
        for i in range(self.hashes):
            n = self._hashfunc(i, val) % self.buckets
            # even buckets sit in the high nibble (shift 4), odd buckets in the low nibble (shift 0)
            yield n // 2, (0 if n % 2 else 4)

    def _read(self, byte, shift):
        '''Return the 4-bit counter stored at (@byte, @shift).'''
        return (self.a[byte] >> shift) & 0x0f

    def _write(self, byte, shift, count):
        '''Store @count (0..MAX_COUNT) at (@byte, @shift) without touching the other nibble of that byte.'''
        keep = 0xff ^ (0x0f << shift)
        self.a[byte] = (self.a[byte] & keep) | (count << shift)

    def insert(self, val):
        '''Add one occurrence of @val by incrementing each of its buckets.'''
        for byte, shift in self._slots(val):
            count = self._read(byte, shift)
            if count < MAX_COUNT:  # we shouldn't increment it if it's at the maximum
                self._write(byte, shift, count + 1)

    def __contains__(self, val):
        '''Return True when every bucket for @val is non-zero (@val may be present), False when it is definitely absent.'''
        return all(self._read(byte, shift) for byte, shift in self._slots(val))

    def max_count(self, val):
        '''Return an upper bound on how many times @val has been inserted.

        This is the smallest counter among the buckets of @val: 0 as soon as any bucket is empty, and MAX_COUNT
        when every bucket is saturated (meaning "MAX_COUNT or more").'''
        lowest = MAX_COUNT
        for byte, shift in self._slots(val):
            count = self._read(byte, shift)
            if count == 0:
                return 0
            # take the minimum; keeping only the last bucket seen would over-report
            lowest = min(lowest, count)
        return lowest

    def __delitem__(self, val):
        '''Remove one occurrence of @val by decrementing each of its buckets.

        Raises KeyError, before changing anything, when @val is not in the filter. Decrementing a zero bucket would
        otherwise produce a negative counter that the byte array cannot store.'''
        slots = list(self._slots(val))
        if not all(self._read(byte, shift) for byte, shift in slots):
            raise KeyError(val)

        for byte, shift in slots:
            count = self._read(byte, shift)
            # we shouldn't decrement it if it's at the maximum; the lower bound covers a value whose hash
            # functions pick the same bucket twice
            if 0 < count < MAX_COUNT:
                self._write(byte, shift, count - 1)

# Names exported by "from <module> import *": both filter classes defined in this file.
__all__ = ['Bloom', 'CountedBloom']

# Self-test / demo: runs the same scenario against a plain filter and a counted filter.
# Each print() takes a single argument so the output is the same under Python 2 and Python 3.
if __name__ == '__main__':

    def _report_false_positives(flt):
        '''Print how many of the integers 0..254 @flt reports as present, then that number as a fraction of 255.'''
        c = sum(1 for x in range(255) if x in flt)
        print(c)
        print(float(c) / 255)

    print('Testing bloom filter: there should be no assertion failures')

    # --- plain bit filter: 3 bytes (24 bits), 4 hash functions ---
    a = Bloom(3, 4)
    for member in (45, 17, 12):
        a.insert(member)
        print(a.a)

    assert 33 not in a
    for member in (45, 17, 12):
        assert member in a
    _report_false_positives(a)

    # --- counted filter with the same number of buckets ---
    a = CountedBloom(24, 4)
    a.insert(45)
    print(a.a)
    a.insert(17)
    print(a.a)
    a.insert(12)
    a.insert(12)
    print("a.max_count(12) %s" % a.max_count(12))

    for _ in range(3):
        a.insert("张沈鹏")
    # query the key that was just inserted, matching the printed label
    print("a.max_count(zsp) %s" % a.max_count("张沈鹏"))
    print(a.a)

    assert 33 not in a
    for member in (45, 17, 12):
        assert member in a
    _report_false_positives(a)

    del a[45]
    assert 45 not in a

    # --- collapse the counted filter into a plain one of matching size ---
    a2 = Bloom(3, 4)
    a2.init_from_counted(a)
    print(a2.a)
    assert 17 in a2
    assert 12 in a2
    assert 45 not in a2

posted on 2012-03-16 02:01 lexus 阅读(...) 评论(...) 编辑 收藏

转载于:https://www.cnblogs.com/lexus/archive/2012/03/16/2399593.html

Bloom filters in Python相关推荐

  1. Cuckoo Filters and Bloom Filters: Comparison and Application to Packet Classification论文总结

    Cuckoo Filters and Bloom Filters: Comparison and Application to Packet Classification论文总结 Abstract I. ...

  2. php bloomfilter,【Bloom filter】Python实现Bloom filter

    hash table具有查找高速的特点,bloom filter在此基础上,解决了内存耗大的问题,代价就是不考虑100%的准确率,存在一定的错误率(可以接受程度) 3.Python实现以及使用 代码实 ...

  3. Bloom Filter 大规模数据处理利器

    2019独角兽企业重金招聘Python工程师标准>>> 最近工作中涉及到bloom Filter,真是一把科研利器呀,大数据.网络.云等等都可以用到! Bloom Filter是由B ...

  4. python爬虫软件-从零开始写Python爬虫,四大工具你值得拥有!

    如果你正在学习编程,那么"爬虫"绝对是你不可忽视的.那么,学习python爬虫之前需要哪些准备? 一颗热爱学习,不屈不挠的心 一台有键盘的电脑(什么系统都行.我用的os x,所以例 ...

  5. python如何爬虫-如何入门 Python 爬虫?

    "入门"是良好的动机,但是可能作用缓慢.如果你手里或者脑子里有一个项目,那么实践起来你会被目标驱动,而不会像学习模块一样慢慢学习. 另外如果说知识体系里的每一个知识点是图里的点,依 ...

  6. python网络爬虫教程-如何入门 Python 爬虫?

    "入门"是良好的动机,但是可能作用缓慢.如果你手里或者脑子里有一个项目,那么实践起来你会被目标驱动,而不会像学习模块一样慢慢学习. 另外如果说知识体系里的每一个知识点是图里的点,依 ...

  7. python爬虫赚钱途径-终于领会python爬虫赚钱的途径

    python爬虫怎么赚钱?python爬虫爬到有价值的数据,进行建模,挖掘就会产生商业价值,下面是小编为您整理的关于python爬虫赚钱的途径,希望对你有所帮助. python爬虫赚钱的途径 Pyth ...

  8. python爬虫原理-干货|如何入门 Python 爬虫?爬虫原理及过程详解

    前言 Python现在非常火,语法简单而且功能强大,很多同学都想学Python!所以小的给各位看官们准备了高价值Python学习视频教程及相关电子版书籍,欢迎前来领取! "入门"是 ...

  9. 如何入门 Python 爬虫?

    "入门"是良好的动机,但是可能作用缓慢.如果你手里或者脑子里有一个项目,那么实践起来你会被目标驱动,而不会像学习模块一样慢慢学习. 另外如果说知识体系里的每一个知识点是图里的点,依 ...

最新文章

  1. 计算机应用基础的答案2015,2015年《计算机应用基础》模拟试题及答案(一)
  2. Cut the Sequence(POJ3017)
  3. 分布式系统与消息的投递
  4. Libra教程之:执行Transactions
  5. CentOS 6.4利用xampp安装bugfree3
  6. leetcode面试题 08.03. 魔术索引(二分)
  7. 工作399-openType=“getUserInfo“ lang=“zh_CN“ bindgetuserinfo=“getUserInfo“
  8. Centos6.4下安装mysql5.6.10
  9. JavaScript操作文件
  10. 【基础】弹出框的处理(五)
  11. JAVA POI读取Excel中Cell为null的处理
  12. ImageNet ILSVRC2012数据集(分类部分)简要介绍和初步处理
  13. 英特尔cpu发布时间表_英特尔公司宣布第九代桌面CPU发售时间:i9 9900K将于10月19日正式发售!...
  14. matlab初值随机扰动,GRAPES区域集合预报系统模式不确定性的随机扰动技术研究
  15. 陈天桥的大脑在孕育什么新传奇(转)
  16. 一. 英语语法 - 简单句
  17. linux 排查cpu负载过高原因
  18. 控制与决策latex排版解答
  19. Servlet 02
  20. 获取微信用户信息方案(测试)

热门文章

  1. c语言开发深圳,2020年深圳杯C题
  2. 已面世两年 大疆精灵4 Pro V2.0为什么仍是最受欢迎的无人机之一?
  3. 网课-文献管理与信息分析-罗昭峰作业答案(二)
  4. ups计算软件_什么是UPS?为什么要用UPS?
  5. 基于VUE + Echarts 实现可视化数据大屏展示效果
  6. 常用元器件封装的命名规范-001
  7. 深入研究vue还是再学react,vue 和 react 哪个前景好
  8. MATLB|基于燃料电池混合动力汽车双层凸优化
  9. 燃料电池汽车(FCV)动力传动系统的多域仿真
  10. 使用 Freeline 纪录篇