Skip to content

ISCC - Minhash#

alg_minhash(features) #

Calculate a 64-dimensional minhash integer vector.

Parameters:

Name Type Description Default
features List[int]

List of integer features

required

Returns:

Type Description
List[int]

Minhash vector

Source code in iscc_core\minhash.py
def alg_minhash(features):
    # type: (List[int]) -> List[int]
    """
    Calculate a 64 dimensional minhash integer vector.

    Every `(a, b)` pair taken from `MPA` and `MPB` defines one permutation of the
    feature values; the smallest permuted value of each permutation becomes one
    entry of the result.

    :param List[int] features: List of integer features
    :return: Minhash vector
    :rtype: List[int]
    """
    minhash = []
    for mul, add in zip(MPA, MPB):
        # Permute every feature with the current coefficient pair, masking the
        # intermediate product with MAXI64 and the final value with MAXH.
        permuted = [(((mul * feat + add) & MAXI64) % MPRIME) & MAXH for feat in features]
        minhash.append(min(permuted))
    return minhash

alg_minhash_64(features) #

Create 64-bit minimum hash digest.

Parameters:

Name Type Description Default
features List[int]

List of integer features

required

Returns:

Type Description
bytes

64-bit binary from the least significant bits of the minhash values

Source code in iscc_core\minhash.py
def alg_minhash_64(features):
    # type: (List[int]) -> bytes
    """
    Create 64-bit minimum hash digest.

    Computes the minhash vector of `features` and keeps one least-significant
    bit of each of its values.

    :param List[int] features: List of integer features
    :return: 64-bit binary from the least significant bits of the minhash values
    :rtype: bytes
    """
    mhash = alg_minhash(features)
    return alg_minhash_compress(mhash, lsb=1)

alg_minhash_256(features) #

Create 256-bit minimum hash digest.

Parameters:

Name Type Description Default
features List[int]

List of integer features

required

Returns:

Type Description
bytes

256-bit binary from the least significant bits of the minhash values

Source code in iscc_core\minhash.py
def alg_minhash_256(features):
    # type: (List[int]) -> bytes
    """
    Create 256-bit minimum hash digest.

    Computes the minhash vector of `features` and keeps the four
    least-significant bits of each of its values.

    :param List[int] features: List of integer features
    :return: 256-bit binary from the least significant bits of the minhash values
    :rtype: bytes
    """
    mhash = alg_minhash(features)
    return alg_minhash_compress(mhash, lsb=4)

alg_minhash_compress(mhash, lsb = 4) #

Compress minhash vector to byte hash-digest.

Concatenates lsb number of least-significant bits from each integer in mhash. For example an mhash with 64 integers and lsb=4 will produce a 256-bit summary of the minhash vector.

Parameters:

Name Type Description Default
mhash List[int]

List of minhash integer features

required
lsb int

Number of the least significant bits to retain

4

Returns:

Type Description
bytes

Binary digest of len(mhash) * lsb bits built from the least significant bits of the minhash values (256 bits for 64 values with lsb=4)

Source code in iscc_core\minhash.py
def alg_minhash_compress(mhash, lsb=4):
    # type: (List[int], int) -> bytes
    """
    Compress minhash vector to byte hash-digest.

    Concatenates `lsb` number of least-significant bits from each integer in `mhash`.
    The bits are emitted plane by plane: first bit 0 of every integer (in list
    order), then bit 1 of every integer, and so on. For example an `mhash` with
    64 integers and `lsb=4` will produce a 256-bit summary of the minhash vector.

    If the total number of bits is not a multiple of 8, the value is left-padded
    with zero bits to fill whole bytes. An empty `mhash` or `lsb=0` yields `b""`.

    :param List[int] mhash: List of minhash integer features
    :param int lsb: Number of the least significant bits to retain
    :return: Big-endian digest of `len(mhash) * lsb` bits taken from the least
        significant bits of the minhash values
    :rtype: bytes
    :raises ValueError: If `lsb` is negative
    """
    if lsb < 0:
        raise ValueError("lsb must be a non-negative integer")
    # Build the bit string in a single join instead of repeated `+=`.
    bits = "".join(str(h >> bitpos & 1) for bitpos in range(lsb) for h in mhash)
    if not bits:
        # int("", 2) would raise; zero bits compress to zero bytes.
        return b""
    return int(bits, 2).to_bytes((len(bits) + 7) // 8, "big")