Skip to content

ISCC - Utilities#

json_canonical(obj) #

Canonical, deterministic serialization of ISCC metadata.

We serialize ISCC metadata in a deterministic/reproducible manner by using JCS (RFC 8785) canonicalization.

Source code in iscc_core\utils.py
def json_canonical(obj):
    # type: (Any) -> bytes
    """
    Serialize ISCC metadata into its canonical, reproducible JSON form.

    The object is canonicalized with
    [JCS (RFC 8785)](https://datatracker.ietf.org/doc/html/rfc8785). As a safety
    check the canonical output is parsed back and compared with the input.

    :param Any obj: Metadata object to serialize
    :return: Canonical serialization as produced by `jcs.canonicalize`
    :rtype: bytes
    :raise ValueError: If the parsed canonical form is not equal to `obj`
    """
    canonical = jcs.canonicalize(obj)
    # Round-trip guard: reject input that does not survive serialization unchanged.
    restored = json.loads(canonical)
    if restored != obj:
        raise ValueError(f"Not canonicalizable {obj} round-trips to {restored}")
    return canonical

sliding_window(seq, width) #

Generate a sequence of equal "width" slices each advancing by one element.

All types that have a length and can be sliced are supported (list, tuple, str ...). The result type matches the type of the input sequence. Fragment slices smaller than the width at the end of the sequence are not produced. If "width" is larger than the input sequence, then one element will be returned that is shorter than the requested width.

Parameters:

Name Type Description Default
seq Sequence

Sequence of values to slide over

required
width int

Width of sliding window in number of items

required

Returns:

Type Description
Generator

A generator of window sized items

Source code in iscc_core\utils.py
def sliding_window(seq, width):
    # type: (Sequence, int) -> Generator
    """
    Generate a sequence of equal "width" slices each advancing by one element.

    Any type that has a length and supports slicing can be used (list, tuple, str ...).
    Each produced slice has the same type as the input sequence.
    Trailing fragments shorter than the width are not produced.
    If "width" is larger than the input sequence, exactly one slice is produced and
    it is shorter than the requested width.

    :param Sequence seq: Sequence of values to slide over
    :param int width: Width of sliding window in number of items
    :returns: A generator of window sized items
    :rtype: Generator
    :raise AssertionError: If width is smaller than 2
    """
    if width < 2:
        raise AssertionError("Sliding window width must be 2 or bigger.")
    # Always emit at least one window, even when the sequence is shorter than width.
    last_start = max(len(seq) - width, 0)
    return (seq[start : start + width] for start in range(last_start + 1))

iscc_compare(a, b) #

Calculate separate hamming distances of compatible components of two ISCCs

Returns:

Type Description
dict

A dict with keys meta_dist, semantic_dist, content_dist, data_dist, instance_match

Source code in iscc_core\utils.py
def iscc_compare(a, b):
    # type: (str, str) -> dict
    """
    Calculate separate hamming distances of compatible components of two ISCCs.

    Both ISCCs are decomposed into their units and every unit of `a` is compared
    with every unit of `b`. Two units are compatible if their MainType, SubType and
    Version are identical. For compatible Instance units the result holds a boolean
    `instance_match`; for all other compatible units it holds the hamming distance
    under the key `<maintype name in lowercase>_dist`. Keys are only present for
    unit types that have a compatible counterpart in both ISCCs.

    :param str a: First ISCC
    :param str b: Second ISCC
    :return: A dict with keys meta_dist, semantic_dist, content_dist, data_dist, instance_match
    :rtype: dict
    """
    ac = [ic.Code(unit) for unit in ic.iscc_decompose(a)]
    bc = [ic.Code(unit) for unit in ic.iscc_decompose(b)]
    result = {}
    for ca in ac:
        # The signature of the left-hand unit is invariant for the inner loop.
        cat = (ca.maintype, ca.subtype, ca.version)
        for cb in bc:
            # Use the version of `cb` here; comparing against `ca.version` would make
            # the version check a no-op and match units with different versions.
            cbt = (cb.maintype, cb.subtype, cb.version)
            if cat != cbt:
                continue
            if ca.maintype != ic.MT.INSTANCE:
                result[ca.maintype.name.lower() + "_dist"] = iscc_distance_bytes(
                    ca.hash_bytes, cb.hash_bytes
                )
            else:
                # Instance units are compared for exact equality, not by distance.
                result["instance_match"] = ca.hash_bytes == cb.hash_bytes
    return result

iscc_similarity(a, b) #

Calculate similarity of ISCC codes as a percentage value (0-100).

MainType, SubType, Version and Length of the codes must be the same.

Parameters:

Name Type Description Default
a

ISCC a

required
b

ISCC b

required

Returns:

Type Description
int

Similarity of ISCC a and b in percent (based on hamming distance)

Source code in iscc_core\utils.py
def iscc_similarity(a, b):
    # type: (str, str) -> int
    """
    Calculate similarity of ISCC codes as a percentage value (0-100).

    MainType, SubType, Version and Length of the codes must be the same.
    The percentage is the share of equal bits in both hash digests, truncated to an
    integer.

    :param a: ISCC a
    :param b: ISCC b
    :return: Similarity of ISCC a and b in percent (based on hamming distance)
    :rtype: int
    """
    digest_a, digest_b = iscc_pair_unpack(a, b)
    differing_bits = iscc_distance_bytes(digest_a, digest_b)
    total_bits = len(digest_a) * 8
    matching_ratio = (total_bits - differing_bits) / total_bits
    return int(matching_ratio * 100)

iscc_distance(a, b) #

Calculate hamming distance of ISCC codes.

MainType, SubType, Version and Length of the codes must be the same.

Parameters:

Name Type Description Default
a

ISCC a

required
b

ISCC b

required

Returns:

Type Description
int

Hamming distance in number of bits.

Source code in iscc_core\utils.py
def iscc_distance(a, b):
    # type: (str, str) -> int
    """
    Calculate hamming distance of ISCC codes.

    MainType, SubType, Version and Length of the codes must be the same.

    :param a: ISCC a
    :param b: ISCC b
    :return: Hamming distance in number of bits.
    :rtype: int
    """
    # Unpacking validates the headers and yields the raw body digests.
    digest_a, digest_b = iscc_pair_unpack(a, b)
    bit_distance = iscc_distance_bytes(digest_a, digest_b)
    return bit_distance

iscc_distance_bytes(a, b) #

Calculate hamming distance for binary hash digests of equal length.

Parameters:

Name Type Description Default
a bytes

binary hash digest

required
b bytes

binary hash digest

required

Returns:

Type Description
int

Hamming distance in number of bits.

Source code in iscc_core\utils.py
def iscc_distance_bytes(a, b):
    # type: (bytes, bytes) -> int
    """
    Calculate hamming distance for binary hash digests of equal length.

    Both digests are loaded into bit arrays and the differing bits are counted
    with `count_xor`.

    :param bytes a: binary hash digest
    :param bytes b: binary hash digest
    :return: Hamming distance in number of bits.
    :rtype: int
    :raise AssertionError: If the digests differ in length
    """
    size_a, size_b = len(a), len(b)
    if size_a != size_b:
        raise AssertionError(f"Hash diggest of unequal length: {size_a} vs {size_b}")
    bits_a = bitarray()
    bits_b = bitarray()
    bits_a.frombytes(a)
    bits_b.frombytes(b)
    return count_xor(bits_a, bits_b)

iscc_pair_unpack(a, b) #

Unpack two ISCC codes and return their body hash digests if their headers match.

Headers match if their MainType, SubType, Version, and Length are identical.

Parameters:

Name Type Description Default
a

ISCC a

required
b

ISCC b

required

Returns:

Type Description
Tuple[bytes, bytes]

Tuple with hash digests of a and b

Raises:

Type Description
ValueError

If ISCC headers don't match

Source code in iscc_core\utils.py
def iscc_pair_unpack(a, b):
    # type: (str, str) -> Tuple[bytes, bytes]
    """
    Unpack two ISCC codes and return their body hash digests if their headers match.

    Each code is normalized, cleaned, base32-decoded and passed to
    `ic.decode_header`. The last element of each decoded result is returned as the
    body digest; all preceding elements are treated as the header and must be equal
    for both codes.

    :param a: ISCC a
    :param b: ISCC b
    :return: Tuple with hash digests of a and b
    :rtype: Tuple[bytes, bytes]
    :raise ValueError: If ISCC headers don't match
    """
    clean_a = ic.iscc_clean(ic.iscc_normalize(a))
    clean_b = ic.iscc_clean(ic.iscc_normalize(b))
    raw_a = ic.decode_base32(clean_a)
    raw_b = ic.decode_base32(clean_b)
    fields_a = ic.decode_header(raw_a)
    fields_b = ic.decode_header(raw_b)
    # Everything except the trailing body digest has to agree.
    headers_match = fields_a[:-1] == fields_b[:-1]
    if not headers_match:
        raise ValueError(f"ISCC headers don´t match: {fields_a}, {fields_b}")
    return fields_a[-1], fields_b[-1]