An example implementation of the simhash algorithm in Python

2020-04-02 13:36:11
OfStack

Put simply, the Simhash algorithm quickly searches a massive collection of texts for those whose Simhash differs from a known Simhash by less than k bits. Here, each text is represented by a single Simhash value. A Simhash has 64 bits (the sample code below defaults to 128), similar texts have Simhash values that differ in only a few of those bits, and the empirical value of k in the paper is 3. The disadvantages of this method are as obvious as its advantages. There are two main ones: for short texts, the result is very sensitive to the value of k; and because the algorithm trades space for time, the system's memory may not be able to handle it.

<img src="//files.jb51.net/file_images/article/201404/2014425111958313.jpg?201432511207" border="0">


#!/usr/bin/python
# coding=utf-8
class simhash:
    """Simhash fingerprint of a sequence of tokens.

    Every token is hashed to a ``hashbits``-wide integer; each bit position
    then takes a majority vote over all token hashes, and the winning bits
    form the fingerprint stored in ``self.hash``.
    """

    def __init__(self, tokens='', hashbits=128):
        """Compute and store the fingerprint of ``tokens``.

        tokens: iterable of strings; each element is hashed as one token
            (a plain string is therefore treated character by character).
        hashbits: width of the fingerprint in bits.
        """
        self.hashbits = hashbits
        self.hash = self.simhash(tokens)

    def __str__(self):
        """Return the fingerprint as a decimal string."""
        return str(self.hash)

    def simhash(self, tokens):
        """Return the simhash fingerprint (a non-negative int) of ``tokens``."""
        bit_positions = range(self.hashbits)
        votes = [0] * self.hashbits
        for token in tokens:
            token_hash = self._string_hash(token)
            for i in bit_positions:
                # A set bit votes +1 for that position, a clear bit votes -1.
                if token_hash & (1 << i):
                    votes[i] += 1
                else:
                    votes[i] -= 1
        # Positions whose tally is >= 0 become 1-bits; ties (including the
        # no-token case, where every tally is 0) therefore resolve to 1.
        return sum(1 << i for i, vote in enumerate(votes) if vote >= 0)

    def hamming_distance(self, other):
        """Return the number of differing bits between the two fingerprints.

        Only the low ``self.hashbits`` bits are compared.
        """
        diff = (self.hash ^ other.hash) & ((1 << self.hashbits) - 1)
        # diff is non-negative after masking, so counting '1' digits in its
        # binary representation is a plain population count.
        return bin(diff).count("1")

    def similarity(self, other):
        """Return the ratio smaller/larger of the two fingerprint values.

        The result lies in [0.0, 1.0]. Equal fingerprints, including two
        all-zero fingerprints, yield 1.0 rather than dividing zero by zero.
        """
        a = float(self.hash)
        b = float(other.hash)
        if a == b:
            return 1.0
        # Fingerprints are non-negative and differ here, so max(a, b) > 0.
        return min(a, b) / max(a, b)

    def _string_hash(self, source):
        """Hash the string ``source`` to an integer.

        A variable-width variant of the classic CPython string hash
        (multiplier 1000003), masked to ``self.hashbits`` bits on every
        step. The empty string hashes to 0.
        """
        if source == "":
            return 0
        mask = (1 << self.hashbits) - 1
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * 1000003) ^ ord(c)) & mask
        # The masked value is never negative, so the "-1 becomes -2" remap
        # of the C implementation can never trigger and is omitted.
        return x ^ len(source)
             
if __name__ == '__main__':
    # Demo: fingerprint a reference sentence, a near-duplicate of it, and an
    # unrelated sentence, then print the Hamming distance and the similarity
    # ratio of each candidate against the reference.
    reference = simhash('This is a test string for testing'.split())
    near_duplicate = simhash('This is a test string for testing also'.split())
    unrelated = simhash('nai nai ge xiong cao'.split())

    for candidate in (near_duplicate, unrelated):
        distance = reference.hamming_distance(candidate)
        ratio = reference.similarity(candidate)
        print(distance , "   " , ratio)