ISCC - Utilities#

`iscc_nph_similarity(a, b)` #

Calculate Normalized Prefix Hamming Similarity (NPHS) between two byte strings.

NPHS is defined as 1.0 minus the Normalized Prefix Hamming Distance (NPHD). It represents the fraction of matching bits within the common prefix.

Parameters:

Name	Type	Description	Default
`a`		First byte string	required
`b`		Second byte string	required

Returns:

Type	Description
	Dictionary with NPHS score and common prefix length {"similarity": float, "common_prefix_bits": int}

Source code in iscc_core\utils.py

def iscc_nph_similarity(a, b):
    # type: (bytes, bytes) -> dict
    """
    Compute the Normalized Prefix Hamming Similarity (NPHS) of two byte strings.

    Only the common prefix is compared, i.e. as many leading bytes as the shorter
    input has. The score is 1.0 minus the fraction of differing bits within that
    prefix. When there is no common prefix, the score is 1.0 if both inputs are
    empty and 0.0 if only one of them is.

    :param a: First byte string
    :param b: Second byte string
    :return: Dictionary with NPHS score and common prefix length
             {"similarity": float, "common_prefix_bits": int}
    """
    prefix_len = min(len(a), len(b))
    prefix_bits = prefix_len * 8

    if prefix_bits == 0:
        # No bits to compare: two empty inputs count as identical, otherwise
        # the inputs share nothing.
        both_empty = len(a) == 0 and len(b) == 0
        return {"similarity": 1.0 if both_empty else 0.0, "common_prefix_bits": 0}

    bits_a = bitarray()
    bits_a.frombytes(a[:prefix_len])
    bits_b = bitarray()
    bits_b.frombytes(b[:prefix_len])

    differing = count_xor(bits_a, bits_b)
    score = 1.0 - (differing / prefix_bits)
    return {"similarity": score, "common_prefix_bits": prefix_bits}

`iscc_nph_distance(a, b)` #

Calculate Normalized Prefix Hamming Distance (NPHD) between two byte strings.

NPHD is defined as the Hamming distance of their common prefix, normalized by the length of that common prefix in bits.

Parameters:

Name	Type	Description	Default
`a`		First byte string	required
`b`		Second byte string	required

Returns:

Type	Description
	Dictionary with NPHD score and common prefix length {"distance": float, "common_prefix_bits": int}

Source code in iscc_core\utils.py

def iscc_nph_distance(a, b):
    # type: (bytes, bytes) -> dict
    """
    Compute the Normalized Prefix Hamming Distance (NPHD) of two byte strings.

    Only the common prefix is compared, i.e. as many leading bytes as the shorter
    input has. The score is the number of differing bits in that prefix divided
    by the prefix length in bits. When there is no common prefix, the score is
    0.0 if both inputs are empty and 1.0 if only one of them is.

    :param a: First byte string
    :param b: Second byte string
    :return: Dictionary with NPHD score and common prefix length
             {"distance": float, "common_prefix_bits": int}
    """
    prefix_len = min(len(a), len(b))
    prefix_bits = prefix_len * 8

    if prefix_bits == 0:
        # No bits to compare: two empty inputs have zero distance, otherwise
        # the distance is maximal.
        both_empty = len(a) == 0 and len(b) == 0
        return {"distance": 0.0 if both_empty else 1.0, "common_prefix_bits": 0}

    bits_a = bitarray()
    bits_a.frombytes(a[:prefix_len])
    bits_b = bitarray()
    bits_b.frombytes(b[:prefix_len])

    differing = count_xor(bits_a, bits_b)
    return {"distance": differing / prefix_bits, "common_prefix_bits": prefix_bits}

`json_canonical(obj)` #

Canonical, deterministic serialization of ISCC metadata.

We serialize ISCC metadata in a deterministic/reproducible manner by using JCS (RFC 8785) canonicalization.

Source code in iscc_core\utils.py

def json_canonical(obj):
    # type: (Any) -> bytes
    """
    Serialize ISCC metadata canonically so the output is reproducible.

    The object is serialized with
    [JCS (RFC 8785)](https://datatracker.ietf.org/doc/html/rfc8785) canonicalization.
    The result is then parsed back with `json.loads` and compared to the input;
    objects that do not survive this round trip unchanged are rejected.

    :param obj: Object to serialize
    :return: The canonical serialization of obj
    :raises ValueError: If the parsed serialization is not equal to obj
    """
    canonical = jcs.canonicalize(obj)
    # Guard against lossy serialization: what we emit must decode to the input.
    decoded = json.loads(canonical)
    if decoded != obj:
        raise ValueError(f"Not canonicalizable {obj} round-trips to {decoded}")
    return canonical

`sliding_window(seq, width)` #

Generate a sequence of equal "width" slices each advancing by one element.

All types that have a length and can be sliced are supported (list, tuple, str ...). The result type matches the type of the input sequence. Fragment slices smaller than the width at the end of the sequence are not produced. If "width" is larger than the input sequence, then one element will be returned that is shorter than the requested width.

Parameters:

Name	Type	Description	Default
`seq`	`Sequence`	Sequence of values to slide over	required
`width`	`int`	Width of sliding window in number of items	required

Returns:

Type	Description
`Generator`	A generator of window sized items

Source code in iscc_core\utils.py

def sliding_window(seq, width):
    # type: (Sequence, int) -> Generator
    """
    Generate slices of `seq` that are `width` items long, each advancing by one element.

    Any type that has a length and can be sliced is supported (list, tuple, str ...),
    and each window has the type that slicing the input produces.
    Trailing fragments shorter than the width are not produced.
    If `width` is larger than the input sequence, exactly one window is produced,
    and it is shorter than the requested width.

    The width check runs when the function is called, not when the generator is
    first consumed.

    :param Sequence seq: Sequence of values to slide over
    :param int width: Width of sliding window in number of items
    :returns: A generator of window sized items
    :rtype: Generator
    :raises AssertionError: If width is smaller than 2
    """
    if width < 2:
        raise AssertionError("Sliding window width must be 2 or bigger.")

    # Always emit at least one window, even if the sequence is shorter than width.
    window_count = max(len(seq) - width + 1, 1)
    starts = range(window_count)
    return (seq[start : start + width] for start in starts)

`iscc_compare(a, b)` #

Calculate separate hamming distances of compatible components of two ISCCs. For ISCC-IDv1, returns a simple match comparison result.

Returns:

Type	Description
`dict`	A dict with component distances or match result for ISCC-IDv1

Source code in iscc_core\utils.py

def iscc_compare(a, b):
    # type: (str, str) -> dict
    """
    Calculate separate hamming distances of compatible components of two ISCCs.
    For ISCC-IDv1, returns a simple match comparison result.

    Both ISCCs are decomposed into units. If either of them contains an
    ISCC-IDv1 unit, no distances are calculated and the result is
    `{"id_match": bool}`, which is True only if both ISCCs contain an ISCC-IDv1
    unit and those units have identical hash bytes.

    Otherwise every pair of units that agree in MainType, SubType and Version is
    compared. Instance units yield `"instance_match"` (exact equality of the
    hash bytes); all other units yield `"<maintype>_dist"` with their hamming
    distance as computed by `iscc_distance_bytes` (which raises an
    AssertionError for digests of unequal length).

    :param a: ISCC a
    :param b: ISCC b
    :return: A dict with component distances or match result for ISCC-IDv1
    :rtype: dict
    """
    ac = [ic.Code(unit) for unit in ic.iscc_decompose(a)]
    bc = [ic.Code(unit) for unit in ic.iscc_decompose(b)]

    def _is_id_v1(code):
        # ISCC-IDv1 units support exact matching only, not similarity.
        return code.maintype == ic.MT.ID and code.version == ic.VS.V1

    # Special handling for ISCC-IDv1
    ids_a = [code for code in ac if _is_id_v1(code)]
    ids_b = [code for code in bc if _is_id_v1(code)]
    if ids_a or ids_b:
        # Compare the ID of `a` against the ID of `b` (not a code with itself);
        # an ID on only one side can never match.
        id_match = (
            bool(ids_a) and bool(ids_b) and ids_a[0].hash_bytes == ids_b[0].hash_bytes
        )
        return {"id_match": id_match}

    result = {}
    for ca in ac:
        for cb in bc:
            cat = (ca.maintype, ca.subtype, ca.version)
            # Use cb's own version so units of different versions are not paired.
            cbt = (cb.maintype, cb.subtype, cb.version)
            if cat != cbt:
                continue
            if ca.maintype != ic.MT.INSTANCE:
                result[ca.maintype.name.lower() + "_dist"] = iscc_distance_bytes(
                    ca.hash_bytes, cb.hash_bytes
                )
            else:
                result["instance_match"] = ca.hash_bytes == cb.hash_bytes
    return result

`iscc_similarity(a, b)` #

Calculate similarity of ISCC codes as a percentage value (0-100).

MainType, SubType, Version and Length of the codes must be the same.

Parameters:

Name	Type	Description	Default
`a`		ISCC a	required
`b`		ISCC b	required

Returns:

Type	Description
`int`	Similarity of ISCC a and b in percent (based on hamming distance)

Source code in iscc_core\utils.py

def iscc_similarity(a, b):
    # type: (str, str) -> int
    """
    Calculate similarity of ISCC codes as a percentage value (0-100).

    The codes are unpacked with `iscc_pair_unpack`, so their headers must match,
    and their digests must have the same length. The percentage is the share of
    identical bits, truncated to an integer.

    :param a: ISCC a
    :param b: ISCC b
    :return: Similarity of ISCC a and b in percent (based on hamming distance)
    :rtype: int
    :raise ValueError: If the ISCC headers don't match or for unsupported types
    """
    digest_a, digest_b = iscc_pair_unpack(a, b)
    differing_bits = iscc_distance_bytes(digest_a, digest_b)
    total_bits = len(digest_a) * 8
    matching_ratio = (total_bits - differing_bits) / total_bits
    return int(matching_ratio * 100)

`iscc_distance(a, b)` #

Calculate hamming distance of ISCC codes.

MainType, SubType, Version and Length of the codes must be the same.

Parameters:

Name	Type	Description	Default
`a`		ISCC a	required
`b`		ISCC b	required

Returns:

Type	Description
`int`	Hamming distance in number of bits.

Source code in iscc_core\utils.py

def iscc_distance(a, b):
    # type: (str, str) -> int
    """
    Calculate the hamming distance between two ISCC codes.

    The codes are unpacked with `iscc_pair_unpack`, so their headers must match,
    and their digests must have the same length.

    :param a: ISCC a
    :param b: ISCC b
    :return: Hamming distance in number of bits.
    :rtype: int
    :raise ValueError: If the ISCC headers don't match or for unsupported types
    """
    digest_a, digest_b = iscc_pair_unpack(a, b)
    bit_distance = iscc_distance_bytes(digest_a, digest_b)
    return bit_distance

`iscc_distance_bytes(a, b)` #

Calculate hamming distance for binary hash digests of equal length.

Parameters:

Name	Type	Description	Default
`a`	`bytes`	binary hash digest	required
`b`	`bytes`	binary hash digest	required

Returns:

Type	Description
`int`	Hamming distance in number of bits.

Source code in iscc_core\utils.py

def iscc_distance_bytes(a, b):
    # type: (bytes, bytes) -> int
    """
    Calculate the hamming distance of two binary hash digests of equal length.

    :param bytes a: binary hash digest
    :param bytes b: binary hash digest
    :return: Hamming distance in number of bits.
    :rtype: int
    :raises AssertionError: If the digests differ in length
    """
    len_a, len_b = len(a), len(b)
    if len_a != len_b:
        raise AssertionError(f"Hash diggest of unequal length: {len_a} vs {len_b}")

    bits_a = bitarray()
    bits_a.frombytes(a)
    bits_b = bitarray()
    bits_b.frombytes(b)
    # The number of set bits in the XOR of both digests is their hamming distance.
    return count_xor(bits_a, bits_b)

`iscc_pair_unpack(a, b)` #

Unpack two ISCC codes and return their body hash digests if their headers match.

Headers match if their MainType, SubType, and Version are identical.

Parameters:

Name	Type	Description	Default
`a`		ISCC a	required
`b`		ISCC b	required

Returns:

Type	Description
`Tuple[bytes, bytes]`	Tuple with hash digests of a and b

Raises:

Type	Description
`ValueError`	If ISCC headers don't match or for unsupported types

Source code in iscc_core\utils.py

def iscc_pair_unpack(a, b):
    # type: (str, str) -> Tuple[bytes, bytes]
    """
    Unpack two ISCC codes and return their body hash digests if their headers match.

    Each code is normalized, cleaned, base32-decoded and split into its header
    fields plus digest. The codes are accepted only if every decoded field except
    the trailing digest is equal (presumably MainType, SubType, Version and
    Length - confirm against `ic.decode_header`).

    :param a: ISCC a
    :param b: ISCC b
    :return: Tuple with hash digests of a and b
    :rtype: Tuple[bytes, bytes]
    :raise ValueError: If ISCC headers don't match or for unsupported types
    """
    # Process both codes stage by stage (rather than one code at a time) so that
    # errors surface in a fixed order.
    text_a = ic.iscc_clean(ic.iscc_normalize(a))
    text_b = ic.iscc_clean(ic.iscc_normalize(b))
    raw_a = ic.decode_base32(text_a)
    raw_b = ic.decode_base32(text_b)
    decoded_a = ic.decode_header(raw_a)
    decoded_b = ic.decode_header(raw_b)

    # ISCC-IDv1 doesn't support similarity comparison
    for decoded in (decoded_a, decoded_b):
        if decoded[0] == ic.MT.ID and decoded[2] == ic.VS.V1:
            raise ValueError("Similarity comparison not supported for ISCC-IDv1")

    # Everything but the last item (the digest) has to agree.
    headers_agree = decoded_a[:-1] == decoded_b[:-1]
    if not headers_agree:
        raise ValueError(f"ISCC headers don´t match: {decoded_a}, {decoded_b}")
    return decoded_a[-1], decoded_b[-1]

ISCC - Utilities#

iscc_nph_similarity(a, b) #

iscc_nph_distance(a, b) #

json_canonical(obj) #

sliding_window(seq, width) #

iscc_compare(a, b) #

iscc_similarity(a, b) #

iscc_distance(a, b) #

iscc_distance_bytes(a, b) #

iscc_pair_unpack(a, b) #

`iscc_nph_similarity(a, b)` #

`iscc_nph_distance(a, b)` #

`json_canonical(obj)` #

`sliding_window(seq, width)` #

`iscc_compare(a, b)` #

`iscc_similarity(a, b)` #

`iscc_distance(a, b)` #

`iscc_distance_bytes(a, b)` #

`iscc_pair_unpack(a, b)` #