Skip to content

ISCC - Utilities#

iscc_nph_similarity(a, b) #

Calculate Normalized Prefix Hamming Similarity (NPHS) between two byte strings.

NPHS is defined as 1.0 minus the Normalized Prefix Hamming Distance (NPHD). It represents the fraction of matching bits within the common prefix.

Parameters:

Name Type Description Default
a

First byte string

required
b

Second byte string

required

Returns:

Type Description

Dictionary with NPHS score and common prefix length {"similarity": float, "common_prefix_bits": int}

Source code in iscc_core\utils.py
def iscc_nph_similarity(a, b):
    # type: (bytes, bytes) -> dict
    """
    Compute the Normalized Prefix Hamming Similarity (NPHS) of two byte strings.

    Only the common prefix is compared, i.e. as many leading bytes as the shorter
    input has. The score is 1.0 minus the fraction of differing bits in that prefix.
    If there is no common prefix, the score is 1.0 when both inputs are empty and
    0.0 when only one of them is.

    :param a: First byte string
    :param b: Second byte string
    :return: Dictionary with NPHS score and common prefix length
             {"similarity": float, "common_prefix_bits": int}
    """
    prefix_len = min(len(a), len(b))
    if prefix_len == 0:
        # Nothing to compare bitwise: two empty inputs count as identical,
        # an empty input against a non-empty one as entirely dissimilar.
        both_empty = len(a) == 0 and len(b) == 0
        return {"similarity": 1.0 if both_empty else 0.0, "common_prefix_bits": 0}

    prefix_bits = prefix_len * 8
    bits_a = bitarray()
    bits_b = bitarray()
    bits_a.frombytes(a[:prefix_len])
    bits_b.frombytes(b[:prefix_len])
    differing = count_xor(bits_a, bits_b)
    return {
        "similarity": 1.0 - (differing / prefix_bits),
        "common_prefix_bits": prefix_bits,
    }

iscc_nph_distance(a, b) #

Calculate Normalized Prefix Hamming Distance (NPHD) between two byte strings.

NPHD is defined as the Hamming distance of their common prefix, normalized by the length of that common prefix in bits.

Parameters:

Name Type Description Default
a

First byte string

required
b

Second byte string

required

Returns:

Type Description

Dictionary with NPHD score and common prefix length {"distance": float, "common_prefix_bits": int}

Source code in iscc_core\utils.py
def iscc_nph_distance(a, b):
    # type: (bytes, bytes) -> dict
    """
    Compute the Normalized Prefix Hamming Distance (NPHD) of two byte strings.

    Only the common prefix is compared, i.e. as many leading bytes as the shorter
    input has. The score is the number of differing bits in that prefix divided by
    the prefix length in bits. If there is no common prefix, the score is 0.0 when
    both inputs are empty and 1.0 when only one of them is.

    :param a: First byte string
    :param b: Second byte string
    :return: Dictionary with NPHD score and common prefix length
             {"distance": float, "common_prefix_bits": int}
    """
    prefix_len = min(len(a), len(b))
    if prefix_len == 0:
        # Nothing to compare bitwise: two empty inputs have no distance,
        # an empty input against a non-empty one has the maximum distance.
        both_empty = len(a) == 0 and len(b) == 0
        return {"distance": 0.0 if both_empty else 1.0, "common_prefix_bits": 0}

    prefix_bits = prefix_len * 8
    bits_a = bitarray()
    bits_b = bitarray()
    bits_a.frombytes(a[:prefix_len])
    bits_b.frombytes(b[:prefix_len])
    differing = count_xor(bits_a, bits_b)
    return {"distance": differing / prefix_bits, "common_prefix_bits": prefix_bits}

json_canonical(obj) #

Canonical, deterministic serialization of ISCC metadata.

We serialize ISCC metadata in a deterministic/reproducible manner by using JCS (RFC 8785) canonicalization.

Source code in iscc_core\utils.py
def json_canonical(obj):
    # type: (Any) -> bytes
    """
    Serialize ISCC metadata canonically so the output is deterministic.

    The object is canonicalized with
    [JCS (RFC 8785)](https://datatracker.ietf.org/doc/html/rfc8785). The result is
    then parsed back and compared with the input, so objects that do not survive
    the round trip unchanged are rejected instead of being serialized lossily.

    :param obj: Object to serialize
    :return: Canonical serialization of obj
    :raise ValueError: If parsing the serialization does not yield a value equal to obj
    """
    canonical = jcs.canonicalize(obj)
    restored = json.loads(canonical)
    if restored != obj:
        raise ValueError(f"Not canonicalizable {obj} round-trips to {restored}")
    return canonical

sliding_window(seq, width) #

Generate a sequence of equal "width" slices, each advancing by one element.

All types that have a length and can be sliced are supported (list, tuple, str ...). The result type matches the type of the input sequence. Fragment slices smaller than the width at the end of the sequence are not produced. If "width" is larger than the input sequence, then a single element will be returned that is shorter than the requested width.

Parameters:

Name Type Description Default
seq Sequence

Sequence of values to slide over

required
width int

Width of sliding window in number of items

required

Returns:

Type Description
Generator

A generator of window sized items

Source code in iscc_core\utils.py
def sliding_window(seq, width):
    # type: (Sequence, int) -> Generator
    """
    Generate slices of "width" items over a sequence, each advancing by one element.

    Anything that has a length and supports slicing works (list, tuple, str ...),
    and each slice has whatever type slicing the input produces. Shorter fragments
    at the end of the sequence are not produced. The one exception is an input
    shorter than "width": then exactly one slice is produced, holding the whole
    input and therefore shorter than the requested width.

    :param Sequence seq: Sequence of values to slide over
    :param int width: Width of sliding window in number of items
    :returns: A generator of window sized items
    :rtype: Generator
    :raise AssertionError: If width is smaller than 2 (raised on call, not on iteration)
    """
    if width < 2:
        raise AssertionError("Sliding window width must be 2 or bigger.")
    # At least one window is always emitted, even when seq is shorter than width.
    window_count = max(len(seq) - width + 1, 1)
    return (seq[start : start + width] for start in range(window_count))

iscc_compare(a, b) #

Calculate separate hamming distances of compatible components of two ISCCs. For ISCC-IDv1, returns a simple match comparison result.

Returns:

Type Description
dict

A dict with component distances or match result for ISCC-IDv1

Source code in iscc_core\utils.py
def iscc_compare(a, b):
    # type: (str, str) -> dict
    """
    Calculate separate hamming distances of compatible components of two ISCCs.
    For ISCC-IDv1, returns a simple match comparison result.

    Both ISCCs are decomposed into their units. If either of them contains an
    ISCC-IDv1 unit, only ``{"id_match": bool}`` is returned; it is True when an
    ISCC-IDv1 unit of `a` has the same hash bytes as an ISCC-IDv1 unit of `b`.

    Otherwise every unit of `a` is paired with every unit of `b` that has the same
    MainType, SubType and Version. Instance units yield ``"instance_match"`` (bool),
    all other units yield ``"<maintype>_dist"`` with their hamming distance.

    :param a: ISCC a
    :param b: ISCC b
    :return: A dict with component distances or match result for ISCC-IDv1
    :rtype: dict
    """
    ac = [ic.Code(unit) for unit in ic.iscc_decompose(a)]
    bc = [ic.Code(unit) for unit in ic.iscc_decompose(b)]

    # Special handling for ISCC-IDv1: compare the identifiers of both sides with
    # each other (a unit must never be compared against itself).
    ids_a = [c for c in ac if c.maintype == ic.MT.ID and c.version == ic.VS.V1]
    ids_b = [c for c in bc if c.maintype == ic.MT.ID and c.version == ic.VS.V1]
    if ids_a or ids_b:
        return {
            "id_match": any(
                ia.hash_bytes == ib.hash_bytes for ia in ids_a for ib in ids_b
            )
        }

    result = {}
    for ca in ac:
        cat = (ca.maintype, ca.subtype, ca.version)
        for cb in bc:
            # Units are only comparable when type, subtype AND version all agree.
            cbt = (cb.maintype, cb.subtype, cb.version)
            if cat != cbt:
                continue
            if ca.maintype != ic.MT.INSTANCE:
                result[ca.maintype.name.lower() + "_dist"] = iscc_distance_bytes(
                    ca.hash_bytes, cb.hash_bytes
                )
            else:
                # Instance units are compared for exact equality, not by distance.
                result["instance_match"] = ca.hash_bytes == cb.hash_bytes
    return result

iscc_similarity(a, b) #

Calculate similarity of ISCC codes as a percentage value (0-100).

MainType, SubType, Version and Length of the codes must be the same.

Parameters:

Name Type Description Default
a

ISCC a

required
b

ISCC b

required

Returns:

Type Description
int

Similarity of ISCC a and b in percent (based on hamming distance)

Source code in iscc_core\utils.py
def iscc_similarity(a, b):
    # type: (str, str) -> int
    """
    Calculate how similar two ISCC codes are, as a percentage (0-100).

    Both codes are unpacked with `iscc_pair_unpack`, which rejects codes whose
    headers do not match. The percentage is the share of equal bits in the two
    digests; fractional percentages are truncated by `int()`.

    :param a: ISCC a
    :param b: ISCC b
    :return: Similarity of ISCC a and b in percent (based on hamming distance)
    :rtype: int
    """
    digest_a, digest_b = iscc_pair_unpack(a, b)
    differing_bits = iscc_distance_bytes(digest_a, digest_b)
    total_bits = len(digest_a) * 8
    matching_bits = total_bits - differing_bits
    return int((matching_bits / total_bits) * 100)

iscc_distance(a, b) #

Calculate hamming distance of ISCC codes.

MainType, SubType, Version and Length of the codes must be the same.

Parameters:

Name Type Description Default
a

ISCC a

required
b

ISCC b

required

Returns:

Type Description
int

Hamming distance in number of bits.

Source code in iscc_core\utils.py
def iscc_distance(a, b):
    # type: (str, str) -> int
    """
    Calculate the hamming distance between two ISCC codes.

    Both codes are unpacked with `iscc_pair_unpack`, which rejects codes whose
    headers do not match; the distance is then taken over their digests.

    :param a: ISCC a
    :param b: ISCC b
    :return: Hamming distance in number of bits.
    :rtype: int
    """
    digest_a, digest_b = iscc_pair_unpack(a, b)
    distance = iscc_distance_bytes(digest_a, digest_b)
    return distance

iscc_distance_bytes(a, b) #

Calculate hamming distance for binary hash digests of equal length.

Parameters:

Name Type Description Default
a bytes

binary hash digest

required
b bytes

binary hash digest

required

Returns:

Type Description
int

Hamming distance in number of bits.

Source code in iscc_core\utils.py
def iscc_distance_bytes(a, b):
    # type: (bytes, bytes) -> int
    """
    Calculate the hamming distance of two binary hash digests of equal length.

    :param bytes a: binary hash digest
    :param bytes b: binary hash digest
    :return: Hamming distance in number of bits.
    :rtype: int
    :raise AssertionError: If the digests differ in length
    """
    len_a, len_b = len(a), len(b)
    if len_a != len_b:
        raise AssertionError(f"Hash diggest of unequal length: {len_a} vs {len_b}")
    bits_a = bitarray()
    bits_b = bitarray()
    bits_a.frombytes(a)
    bits_b.frombytes(b)
    # count_xor yields the number of bit positions in which the arrays differ.
    return count_xor(bits_a, bits_b)

iscc_pair_unpack(a, b) #

Unpack two ISCC codes and return their body hash digests if their headers match.

Headers match if their MainType, SubType, and Version are identical.

Parameters:

Name Type Description Default
a

ISCC a

required
b

ISCC b

required

Returns:

Type Description
Tuple[bytes, bytes]

Tuple with hash digests of a and b

Raises:

Type Description
ValueError

If ISCC headers don't match or for unsupported types

Source code in iscc_core\utils.py
def iscc_pair_unpack(a, b):
    # type: (str, str) -> Tuple[bytes, bytes]
    """
    Unpack two ISCC codes and return their body hash digests if their headers match.

    Each code is normalized, cleaned, base32-decoded and split into its header
    fields plus trailing digest. Headers match when every decoded field before
    that digest is identical for both codes.

    :param a: ISCC a
    :param b: ISCC b
    :return: Tuple with hash digests of a and b
    :rtype: Tuple[bytes, bytes]
    :raise ValueError: If ISCC headers do not match or for unsupported types
    """
    # Process both codes one stage at a time (clean, decode, parse header).
    cleaned = [ic.iscc_clean(ic.iscc_normalize(code)) for code in (a, b)]
    raw = [ic.decode_base32(code) for code in cleaned]
    head_a, head_b = [ic.decode_header(data) for data in raw]

    # Check for ISCC-IDv1 which doesn't support similarity comparison
    for header in (head_a, head_b):
        if header[0] == ic.MT.ID and header[2] == ic.VS.V1:
            raise ValueError("Similarity comparison not supported for ISCC-IDv1")

    # The last item is the digest; everything before it must agree.
    headers_match = head_a[:-1] == head_b[:-1]
    if not headers_match:
        raise ValueError(f"ISCC headers don´t match: {head_a}, {head_b}")
    return head_a[-1], head_b[-1]