Python implements an instance of the simhash algorithm
- 2020-04-02 13:36:11
- OfStack
The algorithm of Simhash is simply to quickly search the Simhash set with a difference of less than k bits between the known Simhash and the mass text. Here, each text can be represented by a Simhash value. A Simhash has 64 bits, similar text has 64 bits, and the empirical value of k in the paper is 3. The disadvantages of this method are as obvious as the advantages, there are two main points, for the short text, k value is very sensitive; The other is that because the algorithm trades space for time, the system's memory can't handle it.
< img SRC = "border = 0 / / files.jb51.net/file_images/article/201404/2014425111958313.jpg? 201432511207 ">
#!/usr/bin/python
# coding=utf-8
class simhash:
# The constructor
def __init__(self, tokens='', hashbits=128):
self.hashbits = hashbits
self.hash = self.simhash(tokens);
#toString function
def __str__(self):
return str(self.hash)
# generate simhash value
def simhash(self, tokens):
v = [0] * self.hashbits
for t in [self._string_hash(x) for x in tokens]: #t for token The ordinary hash value
for i in range(self.hashbits):
bitmask = 1 << i
if t & bitmask :
v[i] += 1 # View the current bit Whether a is 1, If yes, it will be the bit +1
else:
v[i] -= 1 # otherwise , The bit -1
fingerprint = 0
for i in range(self.hashbits):
if v[i] >= 0:
fingerprint += 1 << i
return fingerprint # document-wide fingerprint Is the final bits >=0 And of the
# Find the hamming distance
def hamming_distance(self, other):
x = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
tot = 0;
while x :
tot += 1
x &= x - 1
return tot
# O similarity
def similarity (self, other):
a = float(self.hash)
b = float(other.hash)
if a > b : return b / a
else: return a / b
# for source generate hash value ( A variable length version Python Built-in hash )
def _string_hash(self, source):
if source == "":
return 0
else:
x = ord(source[0]) << 7
m = 1000003
mask = 2 ** self.hashbits - 1
for c in source:
x = ((x * m) ^ ord(c)) & mask
x ^= len(source)
if x == -1:
x = -2
return x
if __name__ == '__main__':
s = 'This is a test string for testing'
hash1 = simhash(s.split())
s = 'This is a test string for testing also'
hash2 = simhash(s.split())
s = 'nai nai ge xiong cao'
hash3 = simhash(s.split())
print(hash1.hamming_distance(hash2) , " " , hash1.similarity(hash2))
print(hash1.hamming_distance(hash3) , " " , hash1.similarity(hash3))