转载自:http://blog.csdn.net/demon24/article/details/8537665
http://blog.csdn.net/u013402746/article/details/28414901 http://palydawn.blog.163.com/blog/static/182969056201210260470485/
#_*_coding:utf_8_
import BitVector
import os
import sys
class SimpleHash():
def __init__(self, capability, seed):
self.capability = capability
self.seed = seed
#传入的value即为url值,ord(value[i])表示第i位字符的ascii码值
def hash(self, value):
ret = 0
for i in range(len(value)):
ret += self.seed*ret + ord(value[i])
#最终产生的随机数是二进制向量最大下标与随机数的按位与结果
return (self.capability-1) & ret
class BloomFilter():
def __init__(self, BIT_SIZE=1<<25):
self.BIT_SIZE = 1 << 25
self.seeds = [5, 7, 11, 13, 31, 37, 61]
#建立一个大小为1<<25=33554432位的二进制向量,分配内存
self.bitset = BitVector.BitVector(size=self.BIT_SIZE)
self.hashFunc = []
#利用8个素数初始化8个随机数生成器
for i in range(len(self.seeds)):
self.hashFunc.append(SimpleHash(self.BIT_SIZE, self.seeds[i]))
def insert(self, value):
for f in self.hashFunc:
loc = f.hash(value)
self.bitset[loc] = 1
def isContaions(self, value):
if value == None:
return False
ret = True
for f in self.hashFunc:
loc = f.hash(value)
#用同样的随机数产生方法对比相应位的二进制值,只要发现有一个不同即返回结果为假
ret = ret & self.bitset[loc]
if ret==False:
return ret
#只有当8个二进制位都相等时才返回真
return ret
def main():
fd = open("urls.txt")
bloomfilter = BloomFilter()
while True:
#url = raw_input()
url = fd.readline()
if cmp(url, 'exit') == 0: #if 'exit' then break
break
if bloomfilter.isContaions(url) == False:
bloomfilter.insert(url)
else:
print 'url :%s has exist' % url
main()
程序根据网页链接的个数计算所需最佳空间大小,即二进制向量的位数,以及所需随机生成器的哈希函数个数:
def __init__(self, error_rate, elementNum):
#计算所需要的bit数
self.bit_num = -1 * elementNum * cmath.log(error_rate) / (cmath.log(2.0) * cmath.log(2.0))
#四字节对齐
self.bit_num = self.align_4byte(self.bit_num.real)
#分配内存
self.bit_array = BitVector(size=self.bit_num)
#计算hash函数个数
self.hash_num = cmath.log(2) * self.bit_num / elementNum
self.hash_num = self.hash_num.real
#向上取整
self.hash_num = int(self.hash_num) + 1
#产生hash函数种子
self.hash_seeds = self.generate_hashseeds(self.hash_num)
生成指定的前n位素数:
def is_prime(n):
if n == 9:
return False
for i in range(3,int(n**0.5)):
if n%i == 0:
return False
return True
def find_prime( n ): #找到从5开始的n个素数,使用素数筛法
prime = []
i = 5
while len(prime) != n:
flag = False
for j in prime:
if i % j == 0:
flag = True
i += 1
break
if flag: #如果能被素数整除就跳过一轮循环
continue
if is_prime(i):
prime.append(i)
i += 1
return prime