titusz committed on
Commit b31f748
1 Parent(s): ce4efbf

Synced repo using 'sync_with_huggingface' Github Action

.editorconfig ADDED
@@ -0,0 +1,20 @@
+ # see http://editorconfig.org
+
+ # Top-level config
+ root = true
+
+
+ # All files
+ [*]
+ charset = utf-8
+ indent_style = space
+ indent_size = 4
+ end_of_line = lf
+ insert_final_newline = true
+ trim_trailing_whitespace = true
+ max_line_length = 119
+
+
+ # YAML files
+ [*.{yml,yaml}]
+ indent_size = 2
CHANGELOG.md ADDED
@@ -0,0 +1,15 @@
+ # Changelog
+
+ ## [0.1.2] - Unreleased
+ - Encode granular features with base64
+ - Refactor result format to generic ISCC data model
+ - Add optional gradio GUI demo
+
+ ## [0.1.1] - 2024-06-25
+ - Handle text decoding errors gracefully
+ - Handle feature bit-lengths independently
+ - Improve model load time
+ - Improve memory use with batched embedding
+
+ ## [0.1.0] - 2024-06-25
+ - Initial pre-release
LICENSE ADDED
@@ -0,0 +1,10 @@
+ # LICENSE
+
+ This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
+
+ To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to
+ Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+
+ ## Attribution
+
+ Titusz Pan, ISCC Foundation - 2024
iscc_sct/__init__.py ADDED
@@ -0,0 +1,6 @@
+ __version__ = "0.1.2"
+ from iscc_sct.options import *
+ from iscc_sct.utils import *
+ from iscc_sct.code_semantic_text import *
+ from iscc_sct.models import *
+ from iscc_sct.main import *
iscc_sct/cli.py ADDED
@@ -0,0 +1,51 @@
+ import argparse
+ import glob
+ from pathlib import Path
+ from loguru import logger
+ from iscc_sct.main import create
+ from charset_normalizer import from_bytes
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
+     parser.add_argument("path", type=str, help="Path to text files (supports glob patterns).", nargs="?")
+     parser.add_argument("-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)")
+     parser.add_argument("-g", "--granular", action="store_true", help="Activate granular processing.")
+     parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
+     args = parser.parse_args()
+
+     if args.path is None:
+         parser.print_help()
+         return
+
+     if not args.debug:
+         logger.remove()
+
+     for path in glob.glob(args.path):
+         path = Path(path)
+         if path.is_file():
+             logger.debug(f"Processing {path.name}")
+             with path.open("rb") as file:
+                 data = file.read()
+             try:
+                 text = data.decode("utf-8")
+                 if not text.strip():
+                     logger.warning(f"SKIPPED empty: {path}")
+                     continue
+             except UnicodeDecodeError:
+                 logger.debug(f"Could not decode {path.name} as UTF-8.")
+                 charset_match = from_bytes(data).best()
+                 if not charset_match:  # pragma: no cover
+                     logger.error(f"SKIPPING {path.name} - failed to detect text encoding")
+                     continue
+                 logger.debug(f"Decode {path.name} with {charset_match.encoding}.")
+                 text = str(charset_match)
+             sct_meta = create(text, granular=args.granular, bits=args.bits)
+             if args.granular:
+                 print(repr(sct_meta))
+             else:
+                 print(sct_meta.iscc)
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
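For readers who prefer the library over the `sct` console script, the CLI loop above maps to a few lines of Python. A minimal sketch of roughly equivalent behavior, minus the charset-detection fallback (the `docs/*.txt` glob is a hypothetical example path):

```python
import glob
from pathlib import Path
import iscc_sct as sct

# Roughly what `sct "docs/*.txt" -b 128 -g` does, without the encoding fallback.
for p in glob.glob("docs/*.txt"):  # hypothetical input folder
    text = Path(p).read_text(encoding="utf-8")
    if text.strip():  # skip empty files, as the CLI does
        print(p, sct.create(text, granular=True, bits=128).iscc)
```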
iscc_sct/code_semantic_text.py ADDED
@@ -0,0 +1,338 @@
+ # -*- coding: utf-8 -*-
+ """*A cross-lingual semantic similarity preserving hash for plain-text content (soft hash).*
+
+ The ISCC Text-Code Semantic is a content-based compact binary code generated from multilingual text.
+
+ !!! Warning
+
+     This is a non-standard Proof of Concept implementation.
+     Plain-text extraction from documents in various formats (especially PDF) may
+     yield different results depending on the extraction tools being used.
+     The [iscc-sdk](https://github.com/iscc/iscc-sdk) uses [Apache Tika](https://tika.apache.org)
+     to extract text from documents for Text-Code generation.
+
+ **Algorithm overview**
+
+ - Split text into semantically coherent overlapping chunks.
+ - Create vector embeddings of the chunks.
+ - Average and binarize the chunk embeddings.
+ - Encode as ISCC-UNIT of MainType SEMANTIC and SubType TEXT.
+ """
+
+ from loguru import logger as log
+ from onnxruntime.capi.onnxruntime_pybind11_state import NoSuchFile
+ from semantic_text_splitter import TextSplitter
+ from tokenizers import Tokenizer
+ from pathlib import Path
+ from typing import Any
+ import numpy as np
+ import onnxruntime as rt
+ from numpy.typing import NDArray
+ from functools import cache
+ import iscc_sct as sct
+
+
+ HERE = Path(__file__).parent.absolute()
+
+
+ __all__ = [
+     "code_text_semantic",
+     "gen_text_code_semantic",
+     "soft_hash_text_semantic",
+     "embed_chunks",
+ ]
+
+ BIT_LEN_MAP = {
+     32: "0000",
+     64: "0001",
+     96: "0010",
+     128: "0011",
+     160: "0100",
+     192: "0101",
+     224: "0110",
+     256: "0111",
+ }
+
+
+ TOKENIZER_PATH = HERE / "tokenizer.json"
+ MAINTYPE = "0001"  # SEMANTIC
+ SUBTYPE = "0000"  # TEXT
+ SCT_VERSION = "0000"  # V0
+
+
+ def code_text_semantic(fp, **options):
+     # type: (Path|str, Any) -> dict[str, Any]
+     """
+     Generate ISCC Semantic-Code Text from a text file.
+
+     NOTE:
+         If you enable granular simprints with `simprints=True`, their bit-length is controlled
+         by the separate `bits_granular` option (default 64).
+
+     :param fp: File path of plaintext file to process
+     :param options: Custom processing options for overriding global options
+     :key bits (int): Length of generated Semantic Text-Code in bits (default 64)
+     :key characters (bool): Return document character count (default True).
+     :key embedding (bool): Return global document embedding (default False).
+     :key precision (int): Max fractional digits for embeddings (default 8).
+     :key simprints (bool): Return granular document simprints (default False).
+     :key offsets (bool): Return character offsets for granular features (default False).
+     :key sizes (bool): Return sizes of text chunks (default False).
+     :key contents (bool): Return text chunks (default False).
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: Dict with ISCC processing results
+     """
+     fp = Path(fp)
+     return gen_text_code_semantic(fp.read_text(encoding="utf-8"), **options)
+
+
+ def gen_text_code_semantic(text, **options):
+     # type: (str, Any) -> dict
+     """
+     Create an ISCC Semantic-Code Text from plaintext.
+
+     :param str text: Plain text for ISCC processing
+     :param options: Custom processing options for overriding global options
+     :key bits (int): Length of generated Semantic Text-Code in bits (default 64)
+     :key characters (bool): Return document character count (default True).
+     :key embedding (bool): Return global document embedding (default False).
+     :key precision (int): Max fractional digits for embeddings (default 8).
+     :key simprints (bool): Return granular document simprints (default False).
+     :key offsets (bool): Return character offsets for granular features (default False).
+     :key sizes (bool): Return sizes of text chunks (default False).
+     :key contents (bool): Return text chunks (default False).
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: Dict with ISCC processing results (using Index-Format for granular features)
+     """
+
+     if not text:
+         raise ValueError("Input text cannot be empty.")
+
+     opts = sct.sct_opts.override(options)
+
+     result = {"iscc": None}  # Initialize first so `iscc` key is "first" in dict
+
+     if opts.characters:
+         result["characters"] = len(text)
+
+     # Text splitting
+     splits = split_text(text, **opts.model_dump())
+     offsets, chunks = [list(item) for item in zip(*splits)]
+
+     # Chunk embedding
+     with sct.timer("EMBEDDING time"):
+         embeddings = embed_chunks(chunks)
+
+     # Create global document embedding
+     embedding = mean_pooling(embeddings)
+
+     if any([opts.simprints, opts.offsets, opts.sizes, opts.contents, opts.embedding]):
+         feature_set = {
+             "maintype": "semantic",
+             "subtype": "text",
+             "version": 0,
+         }
+         if opts.embedding:
+             feature_set["embedding"] = compress(embedding, opts.precision)
+         if opts.simprints:
+             feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings]
+             feature_set["simprints"] = [sct.encode_base64(digest) for digest in feature_digests]
+         if opts.offsets:
+             feature_set["offsets"] = offsets
+         if opts.sizes:
+             feature_set["sizes"] = [len(chunk) for chunk in chunks]
+         if opts.contents:
+             feature_set["contents"] = chunks
+         result["features"] = [feature_set]
+
+     # Encode global document embedding
+     length = BIT_LEN_MAP[opts.bits]
+     header = int(MAINTYPE + SUBTYPE + SCT_VERSION + length, 2).to_bytes(2, byteorder="big")
+     digest = binarize(embedding)[: opts.bits // 8]
+     code = sct.encode_base32(header + digest)
+     result["iscc"] = "ISCC:" + code
+     return result
+
+
+ def soft_hash_text_semantic(text):
+     # type: (str) -> bytes
+     """Creates a semantic similarity preserving hash digest for text input."""
+     chunks = [item[1] for item in split_text(text)]
+     embeddings = embed_chunks(chunks)
+     embedding = mean_pooling(embeddings)
+     digest = binarize(embedding)
+     return digest
+
+
+ def split_text(text, **options):
+     # type: (str, Any) -> list[tuple[int, str]]
+     """
+     Split text into semantically coherent chunks for embedding.
+
+     :param text: Text to split.
+     :param options: Custom processing options for overriding global options
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: A list of offset, chunk tuples [(offset, chunk), ...]
+     """
+     opts = sct.sct_opts.override(options)
+     return splitter(**opts.model_dump()).chunk_indices(text)
+
+
+ @cache
+ def tokenizer():
+     # type: () -> Tokenizer
+     """
+     Load and cache the tokenizer model based on the predefined model name.
+
+     :return: An instance of the Tokenizer.
+     """
+     with sct.timer("TOKENIZER load time"):
+         return Tokenizer.from_file(TOKENIZER_PATH.as_posix())
+
+
+ @cache
+ def splitter(**options):
+     # type: (Any) -> TextSplitter
+     """
+     Load and cache the text splitter, initialized with the tokenizer.
+
+     :param options: Custom processing options for overriding global options
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: An instance of TextSplitter.
+     """
+     opts = sct.sct_opts.override(options)
+     with sct.timer("TEXTSPLITTER load time"):
+         return TextSplitter.from_huggingface_tokenizer(
+             tokenizer(), capacity=opts.max_tokens, overlap=opts.overlap, trim=opts.trim
+         )
+
+
+ @cache
+ def model():
+     # type: () -> rt.InferenceSession
+     """
+     Load and cache the ONNX inference model from a specified path.
+
+     :return: An ONNX inference session.
+     """
+     available_onnx_providers = rt.get_available_providers()
+     log.debug(f"Available ONNX providers {', '.join(available_onnx_providers)}")
+     selected_onnx_providers = ["CPUExecutionProvider"]
+     if "CUDAExecutionProvider" in available_onnx_providers:  # pragma: no cover
+         selected_onnx_providers.insert(0, "CUDAExecutionProvider")
+     log.debug(f"Using ONNX providers {', '.join(selected_onnx_providers)}")
+     so = rt.SessionOptions()
+     so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
+     try:
+         with sct.timer("ONNXMODEL load time"):
+             return rt.InferenceSession(sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers)
+     except NoSuchFile:  # pragma: no cover
+         with sct.timer("ONNXMODEL acquisition/load time"):
+             model_path = sct.get_model()
+             return rt.InferenceSession(model_path, sess_options=so, providers=selected_onnx_providers)
+
+
+ def tokenize_chunks(chunks):
+     # type: (list[str]) -> dict
+     """
+     Tokenize text chunks into model-compatible formats.
+
+     :param chunks: Text chunks to tokenize.
+     :return: Dictionary of tokenized data including input IDs, attention masks, and type IDs.
+     """
+     encodings = tokenizer().encode_batch(chunks)
+     input_ids = np.array([encoding.ids for encoding in encodings], dtype=np.int64)
+     attention_mask = np.array([encoding.attention_mask for encoding in encodings], dtype=np.int64)
+     type_ids = np.array([encoding.type_ids for encoding in encodings], dtype=np.int64)
+     return {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": type_ids}
+
+
+ def embed_chunks(chunks, batch_size=100):
+     """
+     Embed text chunks and return vector embeddings.
+
+     :param chunks: Text chunks to embed.
+     :param batch_size: Number of chunks to process in each batch.
+     :return: An array of embeddings for each chunk.
+     """
+     embeddings = []
+     for start_idx in range(0, len(chunks), batch_size):
+         batch_chunks = chunks[start_idx : start_idx + batch_size]
+         tokens = tokenize_chunks(batch_chunks)
+         token_embeddings = embed_tokens(tokens)
+         batch_embeddings = attention_pooling(token_embeddings, tokens["attention_mask"])
+         embeddings.append(batch_embeddings)
+     return np.vstack(embeddings)
+
+
+ def embed_tokens(tokens):
+     # type: (dict) -> NDArray
+     """
+     Create embeddings from tokenized text chunks using the model.
+
+     :param tokens: Tokenized text data.
+     :return: An array of embeddings.
+     """
+     result = model().run(None, tokens)
+     return np.array(result[0])
+
+
+ def attention_pooling(token_embeddings, attention_mask):
+     # type: (np.array, np.array) -> np.array
+     """
+     Apply attention mask based mean pooling to the token embeddings.
+
+     :param token_embeddings: Raw token embeddings from the model.
+     :param attention_mask: Attention masks for the embeddings.
+     :return: An array of pooled and normalized embeddings.
+     """
+     input_mask_expanded = attention_mask[:, :, None].astype(np.float32)
+     sum_embeddings = np.sum(token_embeddings * input_mask_expanded, axis=1)
+     sum_mask = np.clip(np.sum(input_mask_expanded, axis=1), a_min=1e-9, a_max=None)
+     mean_pooled = sum_embeddings / sum_mask
+     norm = np.linalg.norm(mean_pooled, ord=2, axis=1, keepdims=True)
+     result = mean_pooled / np.clip(norm, a_min=1e-9, a_max=None)
+     return result.astype(np.float32)
+
+
+ def mean_pooling(embeddings):
+     # type: (NDArray[np.float32]) -> NDArray
+     """
+     Calculate the document vector from chunk embeddings using mean pooling.
+
+     :param embeddings: Chunk embeddings.
+     :return: A normalized document vector.
+     """
+     document_vector = embeddings.mean(axis=0)
+     return document_vector / np.linalg.norm(document_vector)
+
+
+ def binarize(vec):
+     # type: (NDArray) -> bytes
+     """
+     Binarize an embedding vector into a hash digest.
+
+     :param vec: Vector to be binarized.
+     :return: A bytes object representing the binary hash.
+     """
+     return bytes(np.packbits(np.array(vec) >= 0))
+
+
+ def compress(vec, precision):
+     # type: (NDArray, int) -> list[float]
+     """
+     Round down vector values to specified precision to reduce storage requirements.
+
+     :param vec: Embedding vector.
+     :param precision: Max number of fractional decimal places.
+     :return: Vector as native Python list of rounded floats.
+     """
+     rounded_array = np.around(vec, decimals=precision)
+     compress_list = [round(x, precision) for x in rounded_array.tolist()]
+     return compress_list
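The module above wires the whole pipeline (split, embed, pool, binarize, encode) behind two entry points. A minimal usage sketch, assuming the package is installed and the ONNX model is available locally or downloadable:

```python
import iscc_sct as sct

# Default options: 64-bit code plus character count in the result dict.
result = sct.gen_text_code_semantic(
    "This is a sample text in English to demonstrate the ISCC-CODE generation.",
    bits=64,
)
print(result["iscc"])        # e.g. "ISCC:CAA..." (digest depends on the model)
print(result["characters"])  # character count of the input text

# The raw similarity-preserving digest behind the code:
digest = sct.soft_hash_text_semantic("Some other text")
print(digest.hex())
```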
iscc_sct/demo.py ADDED
@@ -0,0 +1,153 @@
+ """
+ Gradio demo showcasing the ISCC Semantic Text-Code.
+
+ The demo features:
+
+ - Two side-by-side text inputs.
+ - One sample text per input (one sample in English, the other a German translation of it).
+ - One slider to set the global bit-length (32-256 bits in steps of 32, with 64 as default).
+ - One result output per text input.
+
+ The user can select the samples or write or paste text into the inputs and generate ISCC Semantic
+ Text-Codes for the texts. Below the result outputs we show the similarity of the two codes.
+ """
+
+ from loguru import logger as log
+ import gradio as gr
+ import iscc_sct as sct
+
+
+ def compute_iscc_code(text1, text2, bit_length):
+     code1 = sct.gen_text_code_semantic(text1, bits=bit_length)
+     code2 = sct.gen_text_code_semantic(text2, bits=bit_length)
+     similarity = compare_codes(code1["iscc"], code2["iscc"], bit_length)
+     return code1["iscc"], code2["iscc"], similarity
+
+
+ def compare_codes(code_a, code_b, bits):
+     if all([code_a, code_b]):
+         return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
+
+
+ def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
+     """Approximate the cosine similarity for a given hamming distance and dimension."""
+     result = 1 - (2 * hamming_distance) / dim
+     return result
+
+
+ def generate_similarity_bar(similarity):
+     """Generate a horizontal bar representing the similarity value, scaled to -100% to +100%."""
+     # Scale similarity from [-1, 1] to [-100, 100]
+     display_similarity = similarity * 100
+
+     # Calculate the width of the bar based on the absolute value of similarity
+     bar_width = int(abs(similarity) * 50)  # 50% is half the width of the container
+
+     # Determine the color and starting position based on the sign of the similarity
+     color = "green" if similarity >= 0 else "red"
+     position = "left" if similarity >= 0 else "right"
+
+     # Adjust the text position to be centered within the colored bar
+     text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
+     text_alignment = "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
+
+     bar_html = f"""
+     <h3>Semantic Similarity</h3>
+     <div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
+         <div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
+             <span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
+         </div>
+     </div>
+     """
+     return bar_html
+
+
+ # Sample texts
+ sample_text_en = "This is a sample text in English to demonstrate the ISCC-CODE generation."
+ sample_text_de = "Dies ist ein Beispieltext auf Deutsch, um die Erzeugung von ISCC-CODES zu demonstrieren."
+
+ custom_css = """
+ #chunked-text span.label {
+     text-transform: none !important;
+ }
+ """
+
+ iscc_theme = gr.themes.Default(
+     font=[gr.themes.GoogleFont("Readex Pro")],
+     font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
+     radius_size=gr.themes.sizes.radius_none,
+ )
+
+ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
+     with gr.Row(variant="panel"):
+         gr.Markdown(
+             """
+             ## ✂️ ISCC Semantic Text-Code
+             Demo of cross-lingual Semantic Text-Code (proof of concept)
+             """,
+         )
+     with gr.Row(variant="panel"):
+         in_iscc_bits = gr.Slider(
+             label="ISCC Bit-Length",
+             info="NUMBER OF BITS FOR OUTPUT ISCC",
+             minimum=64,
+             maximum=256,
+             step=32,
+             value=64,
+         )
+     with gr.Row(variant="panel"):
+         with gr.Column(variant="panel"):
+             in_text_a = gr.TextArea(
+                 label="Text",
+                 placeholder="Paste your text here or select sample from below",
+                 lines=12,
+                 max_lines=12,
+             )
+
+             gr.Examples(label="Sample Text", examples=[sample_text_en], inputs=[in_text_a])
+             out_code_a = gr.Textbox(label="ISCC Code for Text A")
+         with gr.Column(variant="panel"):
+             in_text_b = gr.TextArea(
+                 label="Text",
+                 placeholder="Paste your text here or select sample from below",
+                 lines=12,
+                 max_lines=12,
+             )
+
+             gr.Examples(label="Sample Text", examples=[sample_text_de], inputs=[in_text_b])
+             out_code_b = gr.Textbox(label="ISCC Code for Text B")
+
+     with gr.Row(variant="panel"):
+         with gr.Column(variant="panel"):
+             out_similarity = gr.HTML(label="Similarity")
+
+     def process_text(text, nbits, suffix):
+         log.debug(f"{text[:20]}")
+         if not text:
+             return
+         out_code_func = globals().get(f"out_code_{suffix}")
+         iscc = sct.Metadata(**sct.gen_text_code_semantic(text, bits=nbits))
+         result = {out_code_func: gr.Textbox(value=iscc.iscc)}
+         return result
+
+     in_text_a.change(
+         lambda text, nbits: process_text(text, nbits, "a"),
+         inputs=[in_text_a, in_iscc_bits],
+         outputs=[out_code_a],
+         show_progress="full",
+     )
+     in_text_b.change(
+         lambda text, nbits: process_text(text, nbits, "b"),
+         inputs=[in_text_b, in_iscc_bits],
+         outputs=[out_code_b],
+         show_progress="full",
+     )
+
+     out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
+     out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
+     with gr.Row():
+         gr.ClearButton(components=[in_text_a, in_text_b])
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     demo.launch()
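The demo's similarity read-out rests on the `hamming_to_cosine` approximation above: for bit-vectors, cosine similarity is roughly 1 - 2d/n for Hamming distance d over n bits. A standalone worked check (restating the helper so the gradio app need not be imported):

```python
def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
    return 1 - (2 * hamming_distance) / dim

assert hamming_to_cosine(0, 64) == 1.0    # identical codes
assert hamming_to_cosine(32, 64) == 0.0   # half the bits differ: orthogonal
assert hamming_to_cosine(64, 64) == -1.0  # all bits differ
```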
iscc_sct/main.py ADDED
@@ -0,0 +1,32 @@
+ from iscc_sct.models import Metadata
+ from iscc_sct.code_semantic_text import gen_text_code_semantic
+ from iscc_sct.options import sct_opts
+
+
+ __all__ = [
+     "create",
+ ]
+
+
+ def create(text, granular=False, **options):
+     # type: (str, bool, Any) -> Metadata
+     """
+     Create Semantic Text-Code
+
+     High-Level API for creating Semantic Text-Code.
+
+     :param text: Text used for creating Semantic Text-Code.
+     :param granular: Activate options for granular processing (Default: False).
+     :param options: Override individual options for creating Semantic Text-Code.
+     :return: Semantic Text-Code `Metadata` object in Object-Format
+     """
+
+     # Override global options with individual options derived from the `granular` parameter
+     granular = dict(simprints=True, offsets=True, sizes=True, contents=True) if granular else {}
+     opts = sct_opts.override(granular)
+
+     # Override local options with individual options from additional keyword arguments
+     opts = opts.override(options)
+
+     data = gen_text_code_semantic(text, **opts.model_dump())
+     return Metadata(**data).to_object_format()
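A short sketch of this high-level API in use, assuming the model is available:

```python
import iscc_sct as sct

# Object-Format result with granular simprints, offsets, sizes and contents.
meta = sct.create("Hello cross-lingual world!", granular=True, bits=64)
print(meta.iscc)
for feature_set in meta.features or []:
    for feature in feature_set.simprints:
        print(feature.simprint, feature.offset, feature.size)
```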
iscc_sct/models.py ADDED
@@ -0,0 +1,176 @@
+ """
+ # Semantic-Code Text - Datamodel
+
+ This module provides the pydantic metadata schema for Semantic Text-Code results.
+ The schema is conformant with https://schema.iscc.codes/
+
+ The `features` property of the top-level Metadata Object supports two different formats for
+ representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**.
+ These formats are designed to offer flexibility in how feature data is structured and processed,
+ catering to different use cases where either performance or clarity is prioritized.
+
+ ## Features Index-Format (Compact Array Structure):
+
+ In this compact format, features are represented as a list of strings, with optional parallel arrays to
+ store related attributes such as `offsets`, `sizes`, and `contents`.
+
+ **Example**:
+
+ ```json
+ {
+     "maintype": "semantic",
+     "subtype": "text",
+     "version": 0,
+     "simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"],
+     "offsets": [0, 12],
+     "sizes": [12, 48],
+     "contents": ["textchunk no one", "textchunk no two"]
+ }
+ ```
+
+ **Use Case**:
+ - Best suited for scenarios where storage efficiency is critical, and the overhead of processing
+   multiple parallel arrays is acceptable.
+ - Useful when all features share the same set of attributes, allowing for faster bulk processing.
+
+ ## Features Object-Format (Self-Descriptive Object Structure):
+
+ In this convenient format, each feature is represented as an individual object containing its
+ attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but
+ easier to read and work with.
+
+ **Example**:
+
+ ```json
+ {
+     "maintype": "content",
+     "subtype": "text",
+     "version": 0,
+     "simprints": [
+         {
+             "simprint": "lUjuScFYBik",
+             "offset": 0,
+             "size": 25,
+             "content": "ISCC - Semantic Text-Code"
+         }
+     ]
+ }
+ ```
+
+ **Use Case**:
+ - Ideal for scenarios where clarity and readability are prioritized.
+ - Each feature is self-contained, making it easier to understand, extend, and debug.
+ - Flexibility in including or omitting optional attributes per feature.
+
+
+ ### Unified FeatureSet Schema:
+
+ The `FeatureSet` model unifies these two formats by allowing either structure to be used.
+ To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
+ """
+
+ from typing import List, Optional, Union
+ from pydantic import BaseModel
+
+
+ __all__ = ["Feature", "FeatureSet", "Metadata"]
+
+
+ class PrettyBaseModel(BaseModel):
+     def __repr__(self):
+         return self.pretty_repr()
+
+     def pretty_repr(self):
+         return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False)
+
+
+ class Feature(PrettyBaseModel):
+     simprint: str
+     offset: Optional[int] = None
+     size: Optional[int] = None
+     content: Optional[str] = None
+
+
+ class FeatureSet(PrettyBaseModel):
+     maintype: str = "semantic"
+     subtype: str = "text"
+     version: int = 0
+     embedding: Optional[List[float]] = None
+     simprints: Optional[
+         Union[
+             List[str],  # Index-Format
+             List[Feature],  # Object-Format
+         ]
+     ] = None
+     offsets: Optional[List[int]] = None
+     sizes: Optional[List[int]] = None
+     contents: Optional[List[str]] = None
+
+
+ class Metadata(PrettyBaseModel):
+     iscc: str
+     characters: Optional[int] = None
+     features: Optional[List[FeatureSet]] = None
+
+     def to_index_format(self) -> "Metadata":
+         """
+         Convert the Metadata object to use the Index-Format for features.
+         Returns a new Metadata object.
+         """
+         if not self.features:
+             return self.model_copy()
+
+         new_features = []
+         for feature_set in self.features:
+             new_feature_set = feature_set.model_copy()
+             if feature_set.simprints is None:
+                 new_features.append(new_feature_set)
+                 continue
+
+             if isinstance(feature_set.simprints[0], str):
+                 new_features.append(new_feature_set)
+             else:
+                 new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
+                 new_feature_set.offsets = [f.offset for f in feature_set.simprints if f.offset is not None]
+                 new_feature_set.sizes = [f.size for f in feature_set.simprints if f.size is not None]
+                 new_feature_set.contents = [f.content for f in feature_set.simprints if f.content is not None]
+                 new_features.append(new_feature_set)
+
+         return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
+
+     def to_object_format(self) -> "Metadata":
+         """
+         Convert the Metadata object to use the Object-Format for features.
+         Returns a new Metadata object.
+         """
+         if not self.features:
+             return self.model_copy()
+
+         new_features = []
+         for feature_set in self.features:
+             new_feature_set = feature_set.model_copy()
+             if feature_set.simprints is None:
+                 new_features.append(new_feature_set)
+                 continue
+
+             if isinstance(feature_set.simprints[0], Feature):
+                 new_features.append(new_feature_set)
+             else:
+                 new_simprints = []
+                 for i, simprint in enumerate(feature_set.simprints):
+                     feature = Feature(simprint=simprint)
+                     if feature_set.offsets and i < len(feature_set.offsets):
+                         feature.offset = feature_set.offsets[i]
+                     if feature_set.sizes and i < len(feature_set.sizes):
+                         feature.size = feature_set.sizes[i]
+                     if feature_set.contents and i < len(feature_set.contents):
+                         feature.content = feature_set.contents[i]
+                     new_simprints.append(feature)
+                 new_feature_set.simprints = new_simprints
+                 new_feature_set.offsets = None
+                 new_feature_set.sizes = None
+                 new_feature_set.contents = None
+                 new_features.append(new_feature_set)
+
+         return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
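A sketch of round-tripping between the two formats with the models above (the code string and simprint values are placeholders for illustration):

```python
from iscc_sct.models import Metadata

meta = Metadata(
    iscc="ISCC:CAA0000000000000",  # placeholder code string
    features=[{
        "maintype": "semantic",
        "subtype": "text",
        "version": 0,
        "simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"],
        "offsets": [0, 12],
        "sizes": [12, 48],
    }],
)

obj = meta.to_object_format()  # parallel arrays -> list of Feature objects
print(obj.features[0].simprints[0].offset)  # 0
idx = obj.to_index_format()  # and back to parallel arrays
assert idx.features[0].simprints == meta.features[0].simprints
```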
iscc_sct/options.py ADDED
@@ -0,0 +1,78 @@
+ from dotenv import load_dotenv
+ from pydantic import Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ __all__ = [
+     "SctOptions",
+     "sct_opts",
+ ]
+
+
+ load_dotenv()
+
+
+ class SctOptions(BaseSettings):
+     bits: int = Field(
+         64,
+         description="ISCC_SCT_BITS - Default bit-length of generated Semantic Text-Code in bits",
+         ge=32,
+         le=256,
+         multiple_of=32,
+     )
+
+     bits_granular: int = Field(
+         64,
+         description="ISCC_SCT_BITS_GRANULAR - Default bit-length of granular features",
+         ge=32,
+         le=256,
+         multiple_of=32,
+     )
+
+     characters: bool = Field(True, description="ISCC_SCT_CHARACTERS - Include document character count")
+     embedding: bool = Field(False, description="ISCC_SCT_EMBEDDING - Include global document embedding")
+
+     precision: int = Field(8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)")
+
+     simprints: bool = Field(False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints")
+     offsets: bool = Field(False, description="ISCC_SCT_OFFSETS - Include offsets of granular features")
+
+     sizes: bool = Field(False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)")
+
+     contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")
+
+     max_tokens: int = Field(
+         127,
+         description="ISCC_SCT_MAX_TOKENS - Max tokens per chunk (Default 127)",
+         le=127,
+     )
+
+     overlap: int = Field(
+         48,
+         description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
+     )
+
+     trim: bool = Field(False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)")
+
+     model_config = SettingsConfigDict(
+         env_file=".env",
+         env_file_encoding="utf-8",
+         env_prefix="ISCC_SCT_",
+         extra="ignore",
+         validate_assignment=True,
+     )
+
+     def override(self, update=None):
+         # type: (dict|None) -> SctOptions
+         """Returns an updated and validated deep copy of the current settings instance."""
+
+         update = update or {}  # sets {} if update is None
+
+         opts = self.model_copy(deep=True)
+         # We need to update fields individually so validation gets triggered
+         for field, value in update.items():
+             setattr(opts, field, value)
+         return opts
+
+
+ sct_opts = SctOptions()
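All options can also be set via `ISCC_SCT_*` environment variables or a `.env` file; per-call overrides go through `override()`. A minimal sketch (assuming no `ISCC_SCT_BITS` environment override is in effect):

```python
from iscc_sct.options import sct_opts

# Per-call override returns a validated copy; the global instance stays untouched.
opts = sct_opts.override({"bits": 128, "trim": True})
assert opts.bits == 128 and sct_opts.bits == 64

# validate_assignment=True means invalid values raise immediately
# (pydantic's ValidationError is a ValueError subclass).
try:
    sct_opts.override({"bits": 100})  # not a multiple of 32
except ValueError as e:
    print("rejected:", e)
```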
iscc_sct/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
iscc_sct/utils.py ADDED
@@ -0,0 +1,178 @@
+ import math
+ from base64 import b32encode, b32decode
+ from pybase64 import urlsafe_b64encode, urlsafe_b64decode
+ from loguru import logger as log
+ import os
+ import time
+ from pathlib import Path
+ from urllib.request import urlretrieve
+ from blake3 import blake3
+ from platformdirs import PlatformDirs
+
+
+ APP_NAME = "iscc-sct"
+ APP_AUTHOR = "iscc"
+ dirs = PlatformDirs(appname=APP_NAME, appauthor=APP_AUTHOR)
+ os.makedirs(dirs.user_data_dir, exist_ok=True)
+
+
+ __all__ = [
+     "timer",
+     "get_model",
+     "encode_base32",
+     "encode_base64",
+     "hamming_distance",
+     "iscc_distance",
+     "MODEL_PATH",
+ ]
+
+
+ BASE_VERSION = "1.0.0"
+ BASE_URL = f"https://github.com/iscc/iscc-binaries/releases/download/v{BASE_VERSION}"
+ MODEL_FILENAME = "iscc-sct-v0.1.0.onnx"
+ MODEL_URL = f"{BASE_URL}/{MODEL_FILENAME}"
+ MODEL_PATH = Path(dirs.user_data_dir) / MODEL_FILENAME
+ MODEL_CHECKSUM = "ff254d62db55ed88a1451b323a66416f60838dd2f0338dba21bc3b8822459abc"
+
+
+ class timer:
+     def __init__(self, message: str):
+         self.message = message
+
+     def __enter__(self):
+         # Record the start time
+         self.start_time = time.perf_counter()
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         # Calculate the elapsed time
+         elapsed_time = time.perf_counter() - self.start_time
+         # Log the message with the elapsed time
+         log.debug(f"{self.message} {elapsed_time:.4f} seconds")
+
+
+ def get_model():  # pragma: no cover
+     """Check and return local model file if it exists, otherwise download."""
+     if MODEL_PATH.exists():
+         try:
+             return check_integrity(MODEL_PATH, MODEL_CHECKSUM)
+         except RuntimeError:
+             log.warning("Model file integrity error - redownloading ...")
+             urlretrieve(MODEL_URL, filename=MODEL_PATH)
+     else:
+         log.info("Downloading embedding model ...")
+         urlretrieve(MODEL_URL, filename=MODEL_PATH)
+     return check_integrity(MODEL_PATH, MODEL_CHECKSUM)
+
+
+ def check_integrity(file_path, checksum):
+     # type: (str|Path, str) -> Path
+     """
+     Check file integrity against blake3 checksum
+
+     :param file_path: path to file to be checked
+     :param checksum: blake3 checksum to verify integrity
+     :raises RuntimeError: if verification fails
+     """
+     file_path = Path(file_path)
+     file_hasher = blake3(max_threads=blake3.AUTO)
+     with timer("INTEGRITY check time"):
+         file_hasher.update_mmap(file_path)
+         file_hash = file_hasher.hexdigest()
+     if checksum != file_hash:
+         msg = f"Failed integrity check for {file_path.name}"
+         log.error(msg)
+         raise RuntimeError(msg)
+     return file_path
+
+
+ def encode_base32(data):
+     # type: (bytes) -> str
+     """
+     Standard RFC4648 base32 encoding without padding.
+
+     :param bytes data: Data for base32 encoding
+     :return: Base32 encoded str
+     """
+     return b32encode(data).decode("ascii").rstrip("=")
+
+
+ def decode_base32(code):
+     # type: (str) -> bytes
+     """
+     Standard RFC4648 base32 decoding without padding and with casefolding.
+     """
+     # python stdlib does not support base32 without padding, so we have to re-pad.
+     cl = len(code)
+     pad_length = math.ceil(cl / 8) * 8 - cl
+
+     return bytes(b32decode(code + "=" * pad_length, casefold=True))
+
+
+ def encode_base64(data):
+     # type: (bytes) -> str
+     """
+     Standard RFC4648 base64url encoding without padding.
+     """
+     code = urlsafe_b64encode(data).decode("ascii")
+     return code.rstrip("=")
+
+
+ def decode_base64(code):
+     # type: (str) -> bytes
+     """
+     Standard RFC4648 base64url decoding without padding.
+     """
+     padding = 4 - (len(code) % 4)
+     string = code + ("=" * padding)
+     return urlsafe_b64decode(string)
+
+
+ def hamming_distance(a, b):
+     # type: (bytes, bytes) -> int
+     """
+     Calculate the bitwise Hamming distance between two bytes objects.
+
+     :param a: The first bytes object.
+     :param b: The second bytes object.
+     :return: The Hamming distance between two bytes objects.
+     :raise ValueError: If a and b are not the same length.
+     """
+     if len(a) != len(b):
+         raise ValueError("The lengths of the two bytes objects must be the same")
+
+     distance = 0
+     for b1, b2 in zip(a, b):
+         xor_result = b1 ^ b2
+         distance += bin(xor_result).count("1")
+
+     return distance
+
+
+ def iscc_distance(iscc1, iscc2):
+     # type: (str, str) -> int
+     """
+     Calculate the Hamming distance between two ISCC Semantic Text Codes.
+
+     :param iscc1: The first ISCC Semantic Text Code.
+     :param iscc2: The second ISCC Semantic Text Code.
+     :return: The Hamming distance between the two ISCC codes.
+     :raise ValueError: If the input ISCCs are not valid or of different lengths.
+     """
+     # Remove the "ISCC:" prefix if present
+     iscc1 = iscc1[5:] if iscc1.startswith("ISCC:") else iscc1
+     iscc2 = iscc2[5:] if iscc2.startswith("ISCC:") else iscc2
+
+     # Decode the base32-encoded ISCCs
+     decoded1 = decode_base32(iscc1)
+     decoded2 = decode_base32(iscc2)
+
+     # Check if the decoded ISCCs have the same length
+     if len(decoded1) != len(decoded2):
+         raise ValueError("The input ISCCs must have the same length")
+
+     # Remove the 2-byte header from each decoded ISCC
+     content1 = decoded1[2:]
+     content2 = decoded2[2:]
+
+     # Calculate and return the Hamming distance
+     return hamming_distance(content1, content2)
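A quick sketch of the distance helpers above (the header bytes are arbitrary placeholder values):

```python
from iscc_sct.utils import encode_base32, hamming_distance, iscc_distance

# 0b11110000 vs 0b00001111 differ in all 8 bits.
assert hamming_distance(b"\xf0", b"\x0f") == 8

# iscc_distance strips the "ISCC:" prefix, base32-decodes both codes,
# drops the 2-byte header, and compares the remaining digest bits.
code_a = "ISCC:" + encode_base32(b"\x00\x01" + bytes(8))            # all-zero digest
code_b = "ISCC:" + encode_base32(b"\x00\x01" + b"\x80" + bytes(7))  # one bit flipped
assert iscc_distance(code_a, code_b) == 1
```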
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,94 @@
+ [tool.poetry]
+ name = "iscc-sct"
+ version = "0.1.2"
+ description = "ISCC - Semantic Code Text"
+ authors = ["Titusz <[email protected]>"]
+ license = "CC-BY-NC-SA-4.0"
+ readme = "README.md"
+ homepage = "https://iscc.codes"
+ repository = "https://github.com/iscc/iscc-sct"
+ documentation = "https://github.com/iscc/iscc-sct"
+ keywords = ["iscc", "text similarity", "cross lingual", "semantic similarity"]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: Apache Software License",
+     "Natural Language :: English",
+     "Operating System :: OS Independent",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Topic :: Text Processing",
+     "Topic :: Text Processing :: General",
+     "Topic :: Text Processing :: Indexing",
+     "Topic :: Text Processing :: Linguistic",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Multimedia :: Graphics",
+     "Topic :: Scientific/Engineering :: Image Recognition",
+     "Topic :: Scientific/Engineering :: Information Analysis",
+     "Topic :: Software Development :: Libraries :: Python Modules",
+     "Topic :: Software Development :: Libraries",
+     "Topic :: Software Development :: Libraries :: Python Modules",
+     "Topic :: System :: Archiving",
+     "Topic :: System :: Clustering",
+     "Topic :: System :: Distributed Computing",
+ ]
+
+ [tool.poetry.urls]
+ "Changelog" = "https://github.com/iscc/iscc-sct/blob/main/CHANGELOG.md"
+ "Bug Tracker" = "https://github.com/iscc/iscc-sct/issues"
+ "Twitter" = "https://twitter.com/iscc_foundation"
+ "Donate" = "https://iscc.foundation/support"
+
+ [tool.poetry.scripts]
+ sct = 'iscc_sct.cli:main'
+
+ [tool.poetry.dependencies]
+ python = ">=3.9,<3.13"
+ semantic-text-splitter = "*"
+ onnxruntime = "*"
+ onnxruntime-gpu = { version = "*", optional = true }
+ loguru = "*"
+ blake3 = "*"
+ platformdirs = "*"
+ tokenizers = "*"
+ pydantic-settings = "*"
+ charset-normalizer = "*"
+ numpy = "<2.0.0"
+ pybase64 = "^1.4.0"
+ certifi = ">=2024.07.04"
+ gradio = { version = "*", optional = true }
+
+
+ [tool.poetry.extras]
+ gpu = ["onnxruntime-gpu"]
+ demo = ["gradio"]
+
+ [tool.poetry.group.test.dependencies]
+ pytest = "*"
+ coverage = "*"
+ pytest-cov = "*"
+
+ [tool.poetry.group.dev.dependencies]
+ poethepoet = "*"
+ ruff = "*"
+ mdformat-gfm = "*"
+ mdformat-gfm-alerts = "*"
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.format]
+ line-ending = "lf"
+
+ [tool.poe.tasks]
+ format-code = { cmd = "ruff format", help = "Code style formatting with ruff" }
+ format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formatting with mdformat" }
+ test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100 --cov-report=term-missing --color=yes", help = "Run tests with coverage" }
+ all = ["format-code", "format-markdown", "test"]
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
space.yml ADDED
@@ -0,0 +1,44 @@
+ title: ISCC-LAB - Semantic-Code Text
+ emoji: ▶️
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.41.0
+ app_file: iscc_sct/demo.py
+ pinned: true
+ license: CC-BY-NC-SA-4.0
+ python_version: 3.12
+ short_description: Cross Lingual Similarity Preserving Text Simprints
+ description: >
+   # ISCC-LAB - Semantic-Code Text
+
+   `iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
+   [ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
+   short identifiers created from text documents that preserve similarity (in hamming distance)
+   for semantically similar cross-lingual text inputs.
+
+   ## What is the ISCC
+
+   The ISCC is a combination of various similarity preserving fingerprints and an identifier for
+   digital media content.
+
+   ISCCs are generated algorithmically from digital content, just like cryptographic hashes. However,
+   instead of using a single cryptographic hash function to identify data only, the ISCC uses various
+   algorithms to create a composite identifier that exhibits similarity-preserving properties (soft
+   hash or Simprint).
+
+   The component-based structure of the ISCC identifies content at multiple levels of abstraction. Each
+   component is self-describing, modular, and can be used separately or with others to aid in various
+   content identification tasks. The algorithmic design supports content deduplication, database
+   synchronization, indexing, integrity verification, timestamping, versioning, data provenance,
+   similarity clustering, anomaly detection, usage tracking, allocation of royalties, fact-checking and
+   general digital asset management use-cases.
+
+
+   ## ISCC Status
+
+   The [ISCC](https://iscc.codes) is an ISO Standard published under
+   [ISO 24138:2024](https://www.iso.org/standard/77899.html) - International Standard Content Code
+   within [ISO/TC 46/SC 9/WG 18](https://www.iso.org/committee/48836.html).
+
+   The algorithms of this `iscc-sct` repository are experimental and not (yet) part of the official standard.
tests/__init__.py ADDED
File without changes
tests/benchmark.py ADDED
@@ -0,0 +1,55 @@
+ # -*- coding: utf-8 -*-
+ from loguru import logger as log
+ from pathlib import Path
+ import iscc_sct as sct
+ import argparse
+ import time
+
+
+ def benchmark(folder):
+     """
+     Benchmark Text-Code generation for all text files in `folder`.
+
+     Per-file stats are logged to the console during processing.
+     Comprehensive aggregated statistics are shown after processing all files.
+
+     :param folder: Folder containing text files for benchmarking
+     """
+     folder = Path(folder)
+     assert folder.is_dir(), f"{folder} is not a directory."
+
+     total_time = 0
+     file_count = 0
+
+     for txt_path in folder.glob("*.txt"):
+         start_time = time.time()
+         try:
+             iscc_meta = sct.code_text_semantic(txt_path)
+         except Exception as e:
+             log.error(f"Processing {txt_path.name} failed: {e}")
+             continue
+         end_time = time.time()
+         elapsed_time = end_time - start_time
+         total_time += elapsed_time
+         file_count += 1
+         log.info(f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}")
+
+     if file_count > 0:
+         avg_time = total_time / file_count
+         log.info(
+             f"Processed {file_count} files in {total_time:.2f} seconds. Average time per file: {avg_time:.2f} seconds."
+         )
+     else:
+         log.warning("No text files found in the provided folder.")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
+     parser.add_argument("folder", type=str, help="Directory containing text files for benchmarking.")
+     args = parser.parse_args()
+
+     benchmark(args.folder)
+
+
+ if __name__ == "__main__":
+     main()
tests/conftest.py ADDED
@@ -0,0 +1,15 @@
+ import pytest
+ from pathlib import Path
+
+
+ HERE = Path(__file__).parent.absolute()
+
+
+ @pytest.fixture
+ def text_en():
+     return (HERE / "en.txt").read_text(encoding="utf-8")
+
+
+ @pytest.fixture
+ def text_de():
+     return (HERE / "de.txt").read_text(encoding="utf-8")
tests/de.txt ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Vielen Dank, Chris.
3
+
4
+ Es ist mir wirklich eine Ehre, zweimal auf dieser Bühne stehen zu dürfen.
5
+ Tausend Dank dafür.
6
+
7
+ Ich bin wirklich begeistert von dieser Konferenz, und ich danke Ihnen allen für die vielen netten Kommentare zu meiner Rede vorgestern Abend.
8
+ Das meine ich ernst, teilweise deshalb -- weil ich es wirklich brauchen kann!
9
+ (Lachen) Versetzen Sie sich mal in meine Lage!
10
+ (Lachen) (Applaus) Ich bin bin acht Jahre lang mit der Air Force Two geflogen.
11
+ Jetzt muss ich meine Schuhe ausziehen, um überhaupt an Bord zu kommen!
12
+ (Applaus) Ich erzähle Ihnen mal eine Geschichte, dann verstehen Sie mich vielleicht besser.
13
+ Eine wahre Geschichte -- kein Wort daran ist erfunden.
14
+ Kurz nachdem Tipper und ich aus dem (vorgetäuschtes Schluchzen) Weißen Haus ausgezogen waren, fuhren wir von unserem Haus in Nashville zu unserer kleinen Farm 50 Meilen östlich von Nashville --
15
+ und wir fuhren selbst.
16
+
17
+ (Lachen) Ich weiß, für Sie ist das nichts Ungewöhnliches, aber ...
18
+ (Lachen) Ich sah in den Rückspiegel und plötzlich traf mich eine Erkenntnis.
19
+
20
+ Hinter mir war gar keine Autokolonne.
21
+ Haben Sie schon mal vom Phantomschmerz gehört?
22
+ (Lachen)
23
+
24
+ Wir saßen in einem gemieteten Ford Taurus.
25
+ Es war Zeit zum Abendessen und wir hielten Ausschau nach einem Restaurant.
26
+
27
+ Wir waren auf der I-40.
28
+ Wir kamen zur Ausfahrt 238, Lebanon, Tennessee.
29
+ Wir fuhren ab und suchten nach einem ... wir fanden schließlich ein Shoney's.
30
+ Für alle, die es nicht kennen: Das ist eine billige Familienrestaurantkette.
31
+
32
+ Wir gingen rein und setzten uns in eine Nische.
33
+ Die Kellnerin kam zu uns und machte viel Aufhebens um Tipper.
34
+
35
+ Sie nahm unsere Bestellung auf, ging dann zum Paar in der Nische neben uns und senkte ihre Stimme so sehr, dass ich mich richtig anstrengen musste, um sie zu verstehen.
36
+ Sie sagte: "Ja, das ist Ex-Vizepräsident Al Gore und seine Frau Tipper."
37
+ Und der Mann antwortete: "Ganz schöner Abstieg, was?"
38
+ (Lachen) Es gab eine ganze Reihe solcher Offenbarungen.
39
+ Am nächsten Tag -- immer noch eine wahre Geschichte! -- flog ich in einer G5 nach Afrika, um in Nigeria eine Rede zu halten, in Lagos, und zwar über das Thema Energie.
40
+ Zu Beginn der Rede erzählte ich, was mir am Vortag in Nashville passiert war.
41
+
42
+ Ich erzählte es genau so, wie ich es Ihnen gerade erzählt habe.
43
+ Tipper und ich fuhren selbst, Shoney's, billige Familienrestaurantkette, was der Mann gesagt hatte -- alle lachten.
44
+
45
+ Ich hielt meine Rede, dann fuhr ich zurück zum Flughafen, um nach Hause zu fliegen.
46
+ Im Flugzeug schlief ich, bis wir mitten in der Nacht auf den Azoren landeten, um zu tanken.
47
+
48
+ Ich wachte auf, öffnete die Tür und ging hinaus, um frische Luft zu schnappen.
49
+ Da sah ich plötzlich einen Mann über das Rollfeld rennen.
50
+
51
+
52
+ Er wedelte mit einem Stück Papier und schrie: "Rufen Sie Washington an!
53
+ Rufen Sie Washington an!"
54
+
55
+
56
+ Ich dachte so: Mitten in der Nacht, mitten im Atlantik, was in der Welt könnte in Washington schief laufen?
57
+ Dann fiel mir ein, dass da so einiges in Frage kam.
58
+
59
+ (Lachen)
60
+
61
+ (Applaus) Aber mein Mitarbeiter war wegem Folgenden so aufgeregt: Eine der nigerianischen Nachrichtenagenturen hatte schon eine Story über meine Rede herausgegeben.
62
+ Und die war schon in Städten überall in den USA gedruckt worden --
63
+
64
+ auch in Monterey, das habe ich überprüft.
65
+
66
+ Und die Geschichte begann mit: "Ex-Vizepräsident Al Gore gab gestern in Nigeria bekannt: 'Meine Frau Tipper und ich haben ein billiges Familienrestaurant namens Shoney's eröffnet und wir führen es selbst.'"
67
+ Bevor ich wieder amerikanischen Boden betrat, machten David Letterman und Jay Leno schon Witze über mich -- einer von ihnen zeigte mich mit einer großen weißen Kochmütze und Tipper sagte: "Noch einen Burger mit Pommes!"
68
+
69
+ Drei Tage später bekam ich einen netten, langen, handgeschriebenen Brief von meinem Freund, Partner und Kollegen Bill Clinton, in dem er schrieb: "Glückwunsch zum neuen Restaurant, Al!"
70
+ (Lachen) Wir freuen uns immer, wenn der andere Erfolg im Leben hat.
71
+ Ich wollte eigentlich über Informationsökologie sprechen.
72
+ Aber ich dachte, da ich ohnehin noch sehr oft zu TED zurückkommen will, könnte ich das vielleicht auf ein anderes Mal verschieben.
73
+ Chris Anderson: Abgemacht!
74
+
75
+ Ich möchte mich auf das konzentrieren, was viele von Ihnen von mir hören wollen.
76
+ Was kann jeder Einzelne gegen die Klimakrise tun?
77
+
78
+
79
+ Ich möchte beginnen mit ...
80
+ Ich werde einige neue Bilder zeigen und nur vier oder fünf noch mal durchgehen.
81
+
82
+ Ein Wort zur Diashow.
83
+ Ich aktualisiere sie jedes Mal, bevor ich sie zeige.
84
+ Ich füge neue Bilder hinzu, weil ich jedes Mal wieder etwas dazulerne.
85
+ Wie beim Strandgutsammeln -- jedes Mal, wenn die Flut da war,
86
+ findet man neue Muschelschalen.
87
+ Erst in den letzten beiden Tagen hatten wir neue Januar-Temperaturrekorde.
88
+ Das gilt jetzt nur für die USA.
89
+
90
+ Der historische Durchschnitt für Januar liegt bei minus 0,6 Grad.
91
+ Im letzten Monat waren es plus 4,2 Grad.
92
+
93
+ Ich weiß ja, dass Sie auf weitere schlechte Umweltnachrichten warten -- Ich mache nur Spaß --
94
+ aber jetzt kommt erst mal eine kurze Wiederholung und dann zeige ich Ihnen neues Material über mögliche Lösungen.
95
+ Aber erst wollte ich zu einigen Dias noch etwas sagen.
96
+ Zunächst steuern wir hier mit dem US-Beitrag zur Erderwärmung hin, wenn nichts unternommen wird.
97
+ Endverbraucher-Effizienz bei Strom und anderen Energien, das sind die niedrig hängenden Trauben.
98
+ Effizienz und Umweltschutz: Das ist kein Kostenfaktor, sondern ein Gewinnfaktor.
99
+ Das Vorzeichen ist falsch.
100
+ Es ist nicht negativ, sondern positiv.
101
+ Diese Investitionen amortisieren sich von selbst.
102
+ Aber sie lenken uns auch sehr effektiv vom richtigen Weg ab.
103
+ Autos und LKW -- darüber habe ich in der Diashow schon gesprochen, aber ich möchte, dass Sie es im rechten Licht betrachten.
104
+ Das ist ein einfacher, sichtbarer Kritikpunkt, und so sollte es auch sein, aber Gebäude haben einen größeren Anteil an der Erderwärmung als Autos und LKW.
105
+ Autos und LKW sind sehr wichtig, und wir haben die weltweit niedrigsten Normen,
106
+ daher sollten wir das Thema anpacken.
107
+
108
+ Aber es ist nur ein Teil des Ganzen.
109
+ Die Effizienz anderer Transportmittel ist ebenso wichtig wie bei Autos und LKW!
110
+
111
+ Erneuerbare Energien können bei der derzeitigen Technologieeffizienz einiges ausmachen, und nach den Aussagen von Vinod, John Doerr und anderen,
112
+ vielen von Ihnen -- hier sind viele Menschen direkt beteiligt -- wird dieser Keil viel schneller wachsen, als die aktuelle Projektion zeigt.
113
+ Die CO2-Sequestrierung -- abgekürzt CCS -- wird sich wahrscheinlich zum ultimativen Werkzeug entwickeln, mit dem wir fossile Brennstoffe auf sichere Weise weiterhin nutzen können.
114
+ Da sind wir noch nicht ganz.
115
+ Was kann nun der Einzelne tun?
116
+ Emissionen im eigenen Haus reduzieren.
117
+ Die meisten dieser Ausgaben sparen langfristig auch Geld.
118
+ Isolierung, besseres Baudesign, kaufen Sie möglichst umweltfreundlichen Strom.
119
+ Ich sprach von Autos -- kaufen Sie eins mit Hybridantrieb.
120
+ Nutzen Sie den öffentlichen Verkehr.
121
+ Sehen Sie sich nach anderen, besseren Lösungen um.
122
+ Das ist wichtig.
123
+ Kaufen Sie "grün".
124
+ Bei allem, was Sie einkaufen, haben Sie die Wahl zwischen Produkten mit ungünstigen und deutlich weniger ungünstigen Auswirkungen auf die globale Klimakrise.
125
+ Entscheiden Sie sich für ein CO2-neutrales Leben.
126
+ Denjenigen von Ihnen, die sich mit Slogans auskennen, wäre ich sehr dankbar für Tipps und Hilfe, wie man das so formulieren kann, dass es bei der Masse ankommt.
127
+ Es ist einfacher, als Sie glauben.
128
+
129
+ Wirklich.
130
+ Viele von uns hier haben diese Entscheidung getroffen, und es ist wirklich nicht schwer.
131
+
132
+ Reduzieren Sie Ihre CO2-Emissionen durch jede Wahl, die Sie treffen können, und kaufen oder erwerben Sie einen Ausgleich für den Rest, den Sie nicht vermeiden können.
133
+ Genauer wird das auf climatecrisis.net erklärt.
134
+ Da gibt es einen CO2-Rechner.
135
+ Participant Productions hat unter meiner aktiven Teilnahme die führenden Programmierer der Welt zusammengerufen, um aus dieser geheimnisvollen Kunst der CO2-Berechnung einen anwenderfreundlichen CO2-Rechner zu basteln.
136
+ Sie können sehr genau Ihre persönlichen CO2-Emissionen berechnen und erfahren dann Möglichkeiten, sie zu reduzieren.
137
+ Bis zum Filmstart im Mai wird es ein Update auf Version 2.0 geben, in der man sich dann direkt zum Kauf von Ausgleichseinheiten durchklicken kann.
138
+ Versuchen Sie, Ihr Unternehmen CO2-neutral zu führen.
139
+ Auch das haben einige hier schon getan, und es ist leichter, als man denkt.
140
+ Beziehen Sie Klimalösungen in Ihre Innovationen mit ein, egal, ob Sie im Bereich Technologie, Unterhaltung oder Bauwesen und Architektur arbeiten.
141
+ Investieren Sie nachhaltig.
142
+ Davon hat Majora schon gesprochen.
143
+ Wenn Sie Geld in Manager investieren, die Sie auf der Grundlage ihrer Jahresleistung entlohnen, dann beklagen Sie sich nie wieder über kurzfristiges Management.
144
+ Langfristig tun die Leute, wofür man sie bezahlt.
145
+ Und wenn sie aufgrund von kurzfristigen Gewinnen beurteilen, wie viel sie aus Ihrem investierten Kapital herausholen können, dann treffen sie kurzfristige Entscheidungen.
146
+ Darüber lässt sich noch so einiges sagen.
147
+ Werden Sie ein Katalysator für den Wandel.
148
+ Lehren Sie andere, lernen Sie, reden Sie darüber.
149
+ Der Film ist eine Filmversion der Diashow, die ich vorgestern gezeigt habe, nur viel unterhaltsamer.
150
+ Und er kommt im Mai heraus.
151
+ Viele von Ihnen hier können dafür sorgen, dass eine Menge Leute ihn sehen.
152
+ Schicken Sie jemanden nach Nashville.
153
+ Suchen Sie ihn sorgfältig aus.
154
+ Ich werde persönlich Menschen schulen, diese Diashow zu zeigen, abgewandelt, die persönlichen Geschichten werden natürlich durch etwas Allgemeineres ersetzt. Es geht nicht nur um die Dias, sondern um ihre Bedeutung, ihren Zusammenhang.
155
+ Daher werde ich in diesem Sommer einen Kurs für eine Gruppe von Menschen abhalten, die von verschiedenen Leuten dafür nominiert werden, diesen Vortrag massenweise in Gemeinden im ganzen Land zu halten, und wir werden die Diashow für alle jede Woche aktualisieren, damit sie immer topaktuell ist.
156
+ In Zusammenarbeit mit Larry Lessig wird sie demnächst auch mit Tools und eingeschränkten Nutzungsrechten veröffentlicht werden, damit junge Leute eigene Remixe herstellen und sie auf ihre Art präsentieren können.
157
+ (Applaus) Woher stammt nur die Empfehlung, von Politik sollte man sich fernhalten?
158
+ Das bedeutet nicht, dass ich versuchen will, aus Republikanern Demokraten zu machen.
159
+
160
+ Wir brauchen auch Republikaner.
161
+ Das war früher ein parteiübergreifendes Thema,
162
+
163
+ und in dieser Gruppe ist es das ebenfalls.
164
+ Werden Sie politisch aktiv!
165
+ Sorgen Sie dafür, dass unsere Demokratie so funktioniert, wie sie sollte.
166
+ Unterstützen Sie die Beschränkung von CO2-Emissionen, also der Verschmutzung, die die Erderwärmung antreibt, und den Handel damit.
167
+ Und zwar aus folgendem Grund: Solange die USA außen vor bleiben, ist das Weltsystem kein geschlossenes System.
168
+
169
+ Sobald es ein geschlossenes System wird, mit Beteiligung der USA, dann wird jedes Vorstandsmitglied ...
170
+ Wie viele von Ihnen sitzen im Vorstand eines Unternehmens?
171
+
172
+ In einem geschlossenen System sind Sie gesetzlich haftbar, wenn Sie den Vorstandsvorsitzenden nicht drängen, maximales Einkommen aus der Reduzierung und dem Handel mit unvermeidlichen CO2-Emissionen zu erzielen.
173
+ Der Markt wird dieses Problem lösen -- wenn wir das schaffen.
174
+ Helfen Sie bei der groß angelegten Meinungskampagne, die im Frühling beginnt.
175
+ Wir müssen die Amerikaner zum Umdenken bringen.
176
+ Denn gegenwärtig dürfen die Politiker nicht das tun, was getan werden muss.
177
+ In unserem modernen Land ist es nicht mehr Aufgabe von Logik und Vernunft, zwischen Wohlstand und Macht zu vermitteln, wie es früher einmal war.
178
+ Heute brauchen wir viele kurze, brandaktuelle 28 - 30 Sekunden lange Fernsehspots.
179
+ Wir müssen eine Menge solcher Spots kaufen.
180
+ Wir sollten die Erderwärmung umbenennen, wie viele von Ihnen vorgeschlagen haben.
181
+
182
+ Mir gefällt "Klimakrise" besser als "Klimakatastrophe".
183
+ Aber noch einmal, ich brauche Hilfe von Leuten, die sich mit Markenentwicklung auskennen.
184
+
185
+ Ein Wissenschaftler sagte mal zu mir, wir stünden jetzt vor der Prüfung, ob die Kombination aus einem opponierbaren Daumen und einem Neocortex überhaupt lebensfähig ist.
186
+ Das stimmt tatsächlich.
187
+ Wie ich vorgestern bereits sagte: Das ist kein politisches Thema.
188
+ Noch einmal an alle Republikaner hier: Es sollte dabei nicht um Parteipolitik gehen.
189
+ Sie haben mehr Einfluss als einige von uns Demokraten.
190
+ Dies ist eine Gelegenheit.
191
+ Nicht nur das, sondern in Verbindung mit den anderen Ideen hier können wir einen größeren Zusammenhang herstellen.
192
+ Wir sind eins.
193
+ Ich danke Ihnen vielmals.
194
+ (Applaus)
tests/en.txt ADDED
@@ -0,0 +1,155 @@
1
+
2
+ Thank you so much, Chris.
3
+ And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.
4
+ I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.
5
+ And I say that sincerely, partly because (Mock sob) I need that.
6
+ (Laughter) Put yourselves in my position.
7
+ (Laughter) I flew on Air Force Two for eight years.
8
+ (Laughter) Now I have to take off my shoes or boots to get on an airplane!
9
+ (Laughter) (Applause) I'll tell you one quick story to illustrate what that's been like for me.
10
+ (Laughter) It's a true story -- every bit of this is true.
11
+ Soon after Tipper and I left the -- (Mock sob) White House -- (Laughter) we were driving from our home in Nashville to a little farm we have 50 miles east of Nashville.
12
+ Driving ourselves.
13
+ (Laughter) I know it sounds like a little thing to you, but -- (Laughter) I looked in the rear-view mirror and all of a sudden it just hit me.
14
+ There was no motorcade back there.
15
+ (Laughter) You've heard of phantom limb pain?
16
+ (Laughter) This was a rented Ford Taurus.
17
+ (Laughter) It was dinnertime, and we started looking for a place to eat.
18
+ We were on I-40.
19
+ We got to Exit 238, Lebanon, Tennessee.
20
+ We got off the exit, we found a Shoney's restaurant.
21
+ Low-cost family restaurant chain, for those of you who don't know it.
22
+ We went in and sat down at the booth, and the waitress came over, made a big commotion over Tipper.
23
+ (Laughter) She took our order, and then went to the couple in the booth next to us, and she lowered her voice so much, I had to really strain to hear what she was saying.
24
+ And she said "Yes, that's former Vice President Al Gore and his wife, Tipper."
25
+ And the man said, "He's come down a long way, hasn't he?"
26
+ (Laughter) (Applause) There's been kind of a series of epiphanies.
27
+ (Laughter) The very next day, continuing the totally true story, I got on a G-V to fly to Africa to make a speech in Nigeria, in the city of Lagos, on the topic of energy.
28
+ And I began the speech by telling them the story of what had just happened the day before in Nashville.
29
+ And I told it pretty much the same way I've just shared it with you: Tipper and I were driving ourselves, Shoney's, low-cost family restaurant chain, what the man said -- they laughed.
30
+ I gave my speech, then went back out to the airport to fly back home.
31
+ I fell asleep on the plane until, during the middle of the night, we landed on the Azores Islands for refueling.
32
+ I woke up, they opened the door, I went out to get some fresh air, and I looked, and there was a man running across the runway.
33
+
34
+ And he was waving a piece of paper, and he was yelling, "Call Washington!
35
+ Call Washington!"
36
+
37
+ And I thought to myself, in the middle of the night, in the middle of the Atlantic, what in the world could be wrong in Washington?
38
+ Then I remembered it could be a bunch of things.
39
+ (Laughter) But what it turned out to be, was that my staff was extremely upset because one of the wire services in Nigeria had already written a story about my speech, and it had already been printed in cities all across the United States of America.
40
+ It was printed in Monterey, I checked.
41
+
42
+ (Laughter) And the story began, "Former Vice President Al Gore announced in Nigeria yesterday," quote: 'My wife Tipper and I have opened a low-cost family restaurant'" -- (Laughter) "'named Shoney's, and we are running it ourselves.'"
43
+ (Laughter) Before I could get back to U.S. soil, David Letterman and Jay Leno had already started in on -- one of them had me in a big white chef's hat, Tipper was saying, "One more burger with fries!"
44
+
45
+ (Laughter) Three days later, I got a nice, long, handwritten letter from my friend and partner and colleague Bill Clinton, saying, "Congratulations on the new restaurant, Al!"
46
+ (Laughter) We like to celebrate each other's successes in life.
47
+ (Laughter) I was going to talk about information ecology.
48
+ But I was thinking that, since I plan to make a lifelong habit of coming back to TED, that maybe I could talk about that another time.
49
+ (Applause) Chris Anderson: It's a deal!
50
+ (Applause) Al Gore: I want to focus on what many of you have said you would like me to elaborate on: What can you do about the climate crisis?
51
+ I want to start with a couple of -- I'm going to show some new images, and I'm going to recapitulate just four or five.
52
+ Now, the slide show.
53
+ I update the slide show every time I give it.
54
+ I add new images, because I learn more about it every time I give it.
55
+ It's like beach-combing, you know?
56
+ Every time the tide comes in and out, you find some more shells.
57
+ Just in the last two days, we got the new temperature records in January.
58
+ This is just for the United States of America.
59
+ Historical average for Januarys is 31 degrees; last month was 39.5 degrees.
60
+ Now, I know that you wanted some more bad news about the environment -- I'm kidding.
61
+ But these are the recapitulation slides, and then I'm going to go into new material about what you can do.
62
+ But I wanted to elaborate on a couple of these.
63
+ First of all, this is where we're projected to go with the U.S. contribution to global warming, under business as usual.
64
+ Efficiency in end-use electricity and end-use of all energy is the low-hanging fruit.
65
+ Efficiency and conservation -- it's not a cost; it's a profit.
66
+ The sign is wrong.
67
+ It's not negative; it's positive.
68
+ These are investments that pay for themselves.
69
+ But they are also very effective in deflecting our path.
70
+ Cars and trucks -- I talked about that in the slideshow, but I want you to put it in perspective.
71
+ It's an easy, visible target of concern -- and it should be -- but there is more global warming pollution that comes from buildings than from cars and trucks.
72
+ Cars and trucks are very significant, and we have the lowest standards in the world.
73
+
74
+ And so we should address that.
75
+ But it's part of the puzzle.
76
+
77
+ Other transportation efficiency is as important as cars and trucks.
78
+ Renewables at the current levels of technological efficiency can make this much difference.
79
+ And with what Vinod, and John Doerr and others, many of you here -- there are a lot of people directly involved in this -- this wedge is going to grow much more rapidly than the current projection shows it.
80
+ Carbon Capture and Sequestration -- that's what CCS stands for -- is likely to become the killer app that will enable us to continue to use fossil fuels in a way that is safe.
81
+ Not quite there yet.
82
+
83
+ OK.
84
+ Now, what can you do?
85
+
86
+ Reduce emissions in your home.
87
+ Most of these expenditures are also profitable.
88
+ Insulation, better design.
89
+ Buy green electricity where you can.
90
+ I mentioned automobiles -- buy a hybrid.
91
+ Use light rail.
92
+ Figure out some of the other options that are much better.
93
+ It's important.
94
+ Be a green consumer.
95
+ You have choices with everything you buy, between things that have a harsh effect, or a much less harsh effect on the global climate crisis.
96
+ Consider this: Make a decision to live a carbon-neutral life.
97
+ Those of you who are good at branding, I'd love to get your advice and help on how to say this in a way that connects with the most people.
98
+ It is easier than you think.
99
+ It really is.
100
+ A lot of us in here have made that decision, and it is really pretty easy.
101
+ It means reduce your carbon dioxide emissions with the full range of choices that you make, and then purchase or acquire offsets for the remainder that you have not completely reduced.
102
+ And what it means is elaborated at climatecrisis.net.
103
+ There is a carbon calculator.
104
+ Participant Productions convened -- with my active involvement -- the leading software writers in the world, on this arcane science of carbon calculation, to construct a consumer-friendly carbon calculator.
105
+ You can very precisely calculate what your CO2 emissions are, and then you will be given options to reduce.
106
+ And by the time the movie comes out in May, this will be updated to 2.0, and we will have click-through purchases of offsets.
107
+ Next, consider making your business carbon-neutral.
108
+ Again, some of us have done that, and it's not as hard as you think.
109
+ Integrate climate solutions into all of your innovations, whether you are from the technology, or entertainment, or design and architecture community.
110
+ Invest sustainably.
111
+ Majora mentioned this.
112
+ Listen, if you have invested money with managers who you compensate on the basis of their annual performance, don't ever again complain about quarterly report CEO management.
113
+ Over time, people do what you pay them to do.
114
+ And if they judge how much they're going to get paid on your capital that they've invested, based on the short-term returns, you're going to get short-term decisions.
115
+ A lot more to be said about that.
116
+ Become a catalyst of change.
117
+ Teach others, learn about it, talk about it.
118
+ The movie is a movie version of the slideshow I gave two nights ago, except it's a lot more entertaining.
119
+ And it comes out in May.
120
+ Many of you here have the opportunity to ensure that a lot of people see it.
121
+ Consider sending somebody to Nashville.
122
+ Pick well.
123
+ And I am personally going to train people to give this slideshow -- re-purposed, with some of the personal stories obviously replaced with a generic approach, and it's not just the slides, it's what they mean.
124
+ And it's how they link together.
125
+ And so I'm going to be conducting a course this summer for a group of people that are nominated by different folks to come and then give it en masse, in communities all across the country, and we're going to update the slideshow for all of them every single week, to keep it right on the cutting edge.
126
+ Working with Larry Lessig, it will be, somewhere in that process, posted with tools and limited-use copyrights, so that young people can remix it and do it in their own way.
127
+ (Applause) Where did anybody get the idea that you ought to stay arm's length from politics?
128
+ It doesn't mean that if you're a Republican, that I'm trying to convince you to be a Democrat.
129
+ We need Republicans as well.
130
+ This used to be a bipartisan issue, and I know that in this group it really is.
131
+ Become politically active.
132
+ Make our democracy work the way it's supposed to work.
133
+ Support the idea of capping carbon dioxide emissions -- global warming pollution -- and trading it.
134
+ Here's why: as long as the United States is out of the world system, it's not a closed system.
135
+ Once it becomes a closed system, with U.S. participation, then everybody who's on a board of directors -- how many people here serve on the board of directors of a corporation?
136
+ Once it's a closed system, you will have legal liability if you do not urge your CEO to get the maximum income from reducing and trading the carbon emissions that can be avoided.
137
+ The market will work to solve this problem -- if we can accomplish this.
138
+ Help with the mass persuasion campaign that will start this spring.
139
+ We have to change the minds of the American people.
140
+ Because presently, the politicians do not have permission to do what needs to be done.
141
+ And in our modern country, the role of logic and reason no longer includes mediating between wealth and power the way it once did.
142
+ It's now repetition of short, hot-button, 30-second, 28-second television ads.
143
+ We have to buy a lot of those ads.
144
+ Let's re-brand global warming, as many of you have suggested.
145
+ I like "climate crisis" instead of "climate collapse," but again, those of you who are good at branding, I need your help on this.
146
+ Somebody said the test we're facing now, a scientist told me, is whether the combination of an opposable thumb and a neocortex is a viable combination.
147
+ (Laughter) That's really true.
148
+ I said the other night, and I'll repeat now: this is not a political issue.
149
+ Again, the Republicans here -- this shouldn't be partisan.
150
+ You have more influence than some of us who are Democrats do.
151
+ This is an opportunity.
152
+ Not just this, but connected to the ideas that are here, to bring more coherence to them.
153
+ We are one.
154
+ Thank you very much, I appreciate it.
155
+ (Applause)
tests/freeze_tokenizer.py ADDED
@@ -0,0 +1,17 @@
1
+ """Helper script do dump/freeze the current tokenizer"""
2
+
3
+ from tokenizers import Tokenizer
4
+ from pathlib import Path
5
+
6
+
7
+ HERE = Path(__file__).parent.absolute()
8
+
9
+
10
+ def main():
11
+ MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
12
+ tokenizer = Tokenizer.from_pretrained(MODEL_NAME)
13
+ tokenizer.save((HERE.parent / "iscc_sct/tokenizer.json").as_posix(), pretty=False)
14
+
15
+
16
+ if __name__ == "__main__":
17
+ main()
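
The frozen `tokenizer.json` pins the exact tokenization so that chunk boundaries and the hashes asserted in the test suite stay reproducible across upstream model updates (see `test_tokenizer_integrity` further below). A minimal sketch of loading the frozen file instead of fetching from the hub; the path mirrors the one used by the script above and assumes the repository root as working directory, and the printed ids are not asserted here:

    from pathlib import Path
    from tokenizers import Tokenizer

    # Load the frozen tokenizer shipped with the package (path as used above).
    frozen = Path("iscc_sct") / "tokenizer.json"
    tokenizer = Tokenizer.from_file(frozen.as_posix())
    print(tokenizer.encode("Hello World").ids)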
tests/test_cli.py ADDED
@@ -0,0 +1,63 @@
1
+ import subprocess
2
+ import pytest
3
+ import shutil
4
+
5
+ sct = shutil.which("sct")
6
+
7
+
8
+ @pytest.fixture
9
+ def sample_text_file(tmp_path):
10
+ file_path = tmp_path / "sample.txt"
11
+ file_path.write_text("This is a sample text for testing.")
12
+ return file_path
13
+
14
+
15
+ @pytest.fixture
16
+ def empty_text_file(tmp_path):
17
+ file_path = tmp_path / "empty.txt"
18
+ file_path.write_text(" ")
19
+ return file_path
20
+
21
+
22
+ @pytest.fixture
23
+ def non_utf8_text_file(tmp_path):
24
+ file_path = tmp_path / "non_utf8.txt"
25
+ file_path.write_text("Iñtërnâtiônàlizætiøn☃", encoding="utf-16")
26
+ return file_path
27
+
28
+
29
+ def test_cli_no_args():
30
+ result = subprocess.run([sct], capture_output=True, text=True)
31
+ assert result.returncode == 0
32
+ assert "Generate Semantic" in result.stdout
33
+
34
+
35
+ def test_cli_empty_file(empty_text_file):
36
+ result = subprocess.run([sct, str(empty_text_file), "-d"], capture_output=True, text=True)
37
+ assert result.returncode == 0
38
+ assert "SKIPPED" in result.stderr
39
+
40
+
41
+ def test_cli_non_utf8_file(non_utf8_text_file):
42
+ result = subprocess.run([sct, str(non_utf8_text_file), "-d"], capture_output=True, text=True)
43
+ assert result.returncode == 0
44
+ assert "Could not decode" in result.stderr
45
+ assert "ISCC:" in result.stdout
46
+
47
+
48
+ def test_cli_generate_sct(sample_text_file):
49
+ result = subprocess.run([sct, str(sample_text_file)], capture_output=True, text=True)
50
+ assert result.returncode == 0
51
+ assert "ISCC:" in result.stdout
52
+
53
+
54
+ def test_cli_generate_sct_granular(sample_text_file):
55
+ result = subprocess.run([sct, str(sample_text_file), "--granular"], capture_output=True, text=True)
56
+ assert result.returncode == 0
57
+ assert "features" in result.stdout
58
+
59
+
60
+ def test_cli_debug_mode(sample_text_file):
61
+ result = subprocess.run([sct, str(sample_text_file), "--debug"], capture_output=True, text=True)
62
+ assert result.returncode == 0
63
+ assert "DEBUG" in result.stderr
tests/test_demo.py ADDED
@@ -0,0 +1,80 @@
1
+ from iscc_sct.demo import (
2
+ compute_iscc_code,
3
+ compare_codes,
4
+ hamming_to_cosine,
5
+ generate_similarity_bar,
6
+ )
7
+
8
+
9
+ def test_compute_iscc_code():
10
+ text1 = "Hello, world!"
11
+ text2 = "Hallo, Welt!"
12
+ bit_length = 64
13
+
14
+ result = compute_iscc_code(text1, text2, bit_length)
15
+ assert len(result) == 3
16
+ assert all(isinstance(code, str) for code in result[:2])
17
+ assert isinstance(result[2], str)
18
+
19
+
20
+ def test_compare_codes():
21
+ code_a = "ISCC:EAAQCVG2TABD6"
22
+ code_b = "ISCC:EAAQCVG2TABD6"
23
+ bits = 64
24
+
25
+ result = compare_codes(code_a, code_b, bits)
26
+ assert isinstance(result, str)
27
+ assert "100.00%" in result
28
+
29
+ result = compare_codes(None, code_b, bits)
30
+ assert result is None
31
+
32
+
33
+ def test_hamming_to_cosine():
34
+ assert hamming_to_cosine(0, 64) == 1.0
35
+ assert hamming_to_cosine(32, 64) == 0.0
36
+ assert hamming_to_cosine(64, 64) == -1.0
37
+
38
+
39
+ def test_generate_similarity_bar():
40
+ result = generate_similarity_bar(1.0)
41
+ assert "100.00%" in result
42
+ assert "green" in result
43
+
44
+ result = generate_similarity_bar(-0.5)
45
+ assert "-50.00%" in result
46
+ assert "red" in result
47
+
48
+
49
+ from unittest.mock import patch
50
+ import gradio as gr
51
+ from iscc_sct.demo import process_text
52
+
53
+
54
+ @patch("iscc_sct.demo.sct.gen_text_code_semantic")
55
+ def test_process_text(mock_gen_text_code):
56
+ mock_gen_text_code.return_value = {"iscc": "ISCC:EAAQCVG2TABD6"}
57
+
58
+ # Test with valid input
59
+ result = process_text("Hello, world!", 64, "a")
60
+ assert isinstance(result, dict)
61
+ assert len(result) == 1
62
+ key, value = next(iter(result.items()))
63
+ assert isinstance(key, gr.components.Textbox)
64
+ assert isinstance(value, gr.components.Textbox)
65
+ assert value.value == "ISCC:EAAQCVG2TABD6"
66
+
67
+ # Test with empty input
68
+ result = process_text("", 64, "b")
69
+ assert result is None
70
+
71
+ # Test with different bit length
72
+ process_text("Test", 128, "a")
73
+ mock_gen_text_code.assert_called_with("Test", bits=128)
74
+
75
+ # Test with different suffix
76
+ result = process_text("Test", 64, "b")
77
+ assert len(result) == 1
78
+ key, value = next(iter(result.items()))
79
+ assert isinstance(key, gr.components.Textbox)
80
+ assert isinstance(value, gr.components.Textbox)
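
The `hamming_to_cosine` expectations above (0 bits apart gives 1.0, half the bits gives 0.0, all bits gives -1.0) follow from treating each code bit as a ±1 vector component: every differing bit flips one component, so similarity falls off linearly with bit distance. A sketch consistent with those assertions (the demo module's actual implementation may differ):

    def hamming_to_cosine(distance: int, bits: int) -> float:
        # Each differing bit flips one +/-1 component, so cosine similarity
        # drops linearly from 1.0 (identical) to -1.0 (all bits differ).
        return 1.0 - 2.0 * distance / bits

    assert hamming_to_cosine(16, 64) == 0.5  # hypothetical intermediate value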
tests/test_iscc_sct.py ADDED
@@ -0,0 +1,245 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+ from blake3 import blake3
5
+
6
+ import iscc_sct as sct
7
+ from iscc_sct.code_semantic_text import (
8
+ split_text,
9
+ tokenize_chunks,
10
+ embed_tokens,
11
+ embed_chunks,
12
+ compress,
13
+ )
14
+ import numpy as np
15
+
16
+
17
+ HERE = Path(__file__).parent.absolute()
18
+
19
+ TEXT = """
20
+ `iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
21
+ [ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
22
+ designed to capture and represent the language agnostic semantic content of text for improved
23
+ similarity detection.
24
+
25
+ The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match
26
+ near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on
27
+ capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is
28
+ engineered to be robust against a broader range of variations and translations of text that cannot
29
+ be matched based on lexical similarity.
30
+ """
31
+
32
+
33
+ def test_version():
34
+ assert sct.__version__ == "0.1.2"
35
+
36
+
37
+ def test_code_text_semantic_default():
38
+ fp = HERE / "en.txt"
39
+ result = sct.code_text_semantic(fp)
40
+ assert result == {
41
+ "iscc": "ISCC:CAA636IXQD736IGJ",
42
+ "characters": 12076,
43
+ }
44
+
45
+
46
+ def test_code_text_semantic_no_chars():
47
+ fp = HERE / "en.txt"
48
+ result = sct.code_text_semantic(fp, characters=False)
49
+ assert result == {"iscc": "ISCC:CAA636IXQD736IGJ"}
50
+
51
+
52
+ def test_code_text_semantic_embedding():
53
+ fp = HERE / "en.txt"
54
+ result = sct.code_text_semantic(fp, embedding=True)
55
+ assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
56
+ assert len(result["features"][0]["embedding"]) == 384
57
+
58
+
59
+ def test_code_text_semantic_features():
60
+ fp = HERE / "en.txt"
61
+ result = sct.code_text_semantic(fp, simprints=True)
62
+ assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
63
+ assert result["characters"] == 12076
64
+ assert result["features"][0]["simprints"][:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
65
+ assert result["features"][0]["simprints"][-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]
66
+
67
+
68
+ def test_code_text_semantic_offsets():
69
+ fp = HERE / "en.txt"
70
+ result = sct.code_text_semantic(fp, offsets=True)
71
+ assert result["features"][0]["offsets"][:3] == [0, 277, 612]
72
+
73
+
74
+ def test_code_text_semantic_chunks():
75
+ fp = HERE / "en.txt"
76
+ result = sct.code_text_semantic(fp, contents=True)
77
+ assert len(result["features"][0]["contents"]) == 39
78
+ assert result["features"][0]["contents"][0].startswith("\n Thank ")
79
+ assert result["features"][0]["contents"][-1].endswith("(Applause)\n")
80
+
81
+
82
+ def test_code_text_semantic_sizes():
83
+ fp = HERE / "en.txt"
84
+ result = sct.code_text_semantic(fp, sizes=True)
85
+ # fmt: off
86
+ assert result["features"][0]["sizes"] == [
87
+ 440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446, 442, 443, 444, 451, 485,
88
+ 477, 439, 517, 430, 468, 394, 531, 448, 421, 503, 376, 403, 513, 477, 393, 375, 555, 533,
89
+ 312, 455, 413
90
+ ]
91
+ # fmt: on
92
+
93
+
94
+ def test_gen_text_code_semantic_empty():
95
+ with pytest.raises(ValueError) as excinfo:
96
+ sct.gen_text_code_semantic("")
97
+ assert str(excinfo.value) == "Input text cannot be empty."
98
+
99
+
100
+ def test_gen_text_code_semantic_granular():
101
+ result = sct.gen_text_code_semantic(
102
+ TEXT,
103
+ simprints=True,
104
+ offsets=True,
105
+ contents=True,
106
+ )
107
+ assert (
108
+ result
109
+ == {
110
+ "characters": 726,
111
+ "iscc": "ISCC:CAARISHPJHEXQAYL",
112
+ "features": [
113
+ {
114
+ "maintype": "semantic",
115
+ "subtype": "text",
116
+ "version": 0,
117
+ "simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"],
118
+ "offsets": [0, 297],
119
+ "contents": [
120
+ "\n"
121
+ "`iscc-sct` is a **proof of concept implementation** of a semantic "
122
+ "Text-Code for the\n"
123
+ "[ISCC](https://core.iscc.codes) (*International Standard Content "
124
+ "Code*). Semantic Text-Codes are\n"
125
+ "designed to capture and represent the language agnostic semantic "
126
+ "content of text for improved\n"
127
+ "similarity detection.\n"
128
+ "\n", # NOTE: end of first chunk (see comma :)
129
+ "\n"
130
+ "\n"
131
+ "The ISCC framework already comes with a Text-Code that is based "
132
+ "on lexical similarity and can match\n"
133
+ "near duplicates. The ISCC Semantic Text-Code is planned as a new "
134
+ "additional ISCC-UNIT focused on\n"
135
+ "capturing a more abstract and broad semantic similarity. As such "
136
+ "the Semantic Text-Code is\n"
137
+ "engineered to be robust against a broader range of variations and "
138
+ "translations of text that cannot\n"
139
+ "be matched based on lexical similarity.\n",
140
+ ],
141
+ }
142
+ ],
143
+ }
144
+ )
145
+
146
+
147
+ def test_gen_text_code_semantic_checks_bits():
148
+ with pytest.raises(ValueError):
149
+ sct.gen_text_code_semantic("Test", bits=99)
150
+
151
+
152
+ def test_split_text(text_en):
153
+ chunks = split_text(text_en)
154
+ assert chunks[0][1][:8] == "\n Thank "
155
+ assert chunks[-1][1][:8] == "\n (Laugh"
156
+
157
+
158
+ def test_split_text_override():
159
+ text = "Try some very small and granular text splitting. Use options override for it."
160
+ chunks = split_text(text, max_tokens=8, overlap=4)
161
+ assert chunks == [
162
+ (0, "Try some very small and granular text "),
163
+ (20, "and granular text splitting. "),
164
+ (49, "Use options override for it."),
165
+ ]
166
+
167
+
168
+ def test_tokenize_chunks():
169
+ chunks = ["Hello World", "These are chunks"]
170
+ result = tokenize_chunks(chunks)
171
+ np.testing.assert_array_equal(
172
+ result["input_ids"],
173
+ np.array([[0, 35378, 6661, 2, 1, 1], [0, 32255, 621, 7839, 1224, 2]], dtype=np.int64),
174
+ )
175
+
176
+
177
+ def test_embed_tokens():
178
+ chunks = ["Hello World", "These are chunks"]
179
+ tokens = tokenize_chunks(chunks)
180
+ embeddings = embed_tokens(tokens)
181
+ assert list(embeddings[0][0][:3]) == pytest.approx([0.05907335, 0.11408358, 0.12727071], rel=1e-2)
182
+
183
+
184
+ def test_embed_chunks():
185
+ chunks = ["Hello World"]
186
+ expected = [0.008697219, 0.038051583, 0.043976285]
187
+ embeddings = embed_chunks(chunks)
188
+ assert list(embeddings[0][:3]) == pytest.approx(expected, rel=1e-3)
189
+
190
+
191
+ def test_gen_text_code_semantic(text_en):
192
+ result = sct.gen_text_code_semantic(text_en, embedding=True)
193
+ assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
194
+ assert result["features"][0]["embedding"][:3] == pytest.approx(
195
+ [0.03241169825196266, 0.022712377831339836, 0.050273094326257706],
196
+ rel=1e-3,
197
+ )
198
+
199
+
200
+ def test_cross_lingual_match(text_en, text_de):
201
+ a = sct.gen_text_code_semantic(text_en)["iscc"]
202
+ assert a == "ISCC:CAA636IXQD736IGJ"
203
+ b = sct.gen_text_code_semantic(text_de)["iscc"]
204
+ assert b == "ISCC:CAA636IXQD4TMIGL" # hamming distance for the codes is 6 bits
205
+
206
+
207
+ def test_tokenizer_integrity(text_en):
208
+ # test if updates break tokenizer compatibility
209
+ hasher = blake3()
210
+ for idx, chunk in split_text(text_en):
211
+ hasher.update(chunk.encode("utf-8"))
212
+ checksum = hasher.hexdigest()
213
+ assert checksum == "7a7ad1ce83c36f853d31390150403e225bac7825a5573dd5c9e326b0917c7b52"
214
+
215
+
216
+ def test_soft_hash_text_semantic():
217
+ result = sct.soft_hash_text_semantic("Hello World")
218
+ assert (
219
+ result.hex()
220
+ == "f36789d8d1bbe351106bdf8e9b5006a3fc4cb1eb4042c75ea26b5058857c9177705429237858e9940e133c8b12ee1a3d"
221
+ )
222
+
223
+
224
+ def test_shift_resistance(text_en):
225
+ a = sct.soft_hash_text_semantic(text_en)
226
+ shifted = "Just put another sentence in the begginging of the text!\n" + text_en
227
+ b = sct.soft_hash_text_semantic(shifted)
228
+ # TODO improve algorithm with more shift resistant semantic chunking
229
+ # On 256-bit code
230
+ assert sct.hamming_distance(a, b) == 6
231
+ # On 64-bit code
232
+ assert sct.hamming_distance(b[:16], a[:16]) == 1
233
+
234
+
235
+ def test_compress():
236
+ arr1 = np.array([3.0, 15294.7789, 32977.7])
237
+ arr2 = np.array([3.0, 15294.7789, 32977.7], dtype=np.float32)
238
+ expected = [3.0, 15294.8, 32977.7]
239
+ assert compress(arr1, 1) == expected
240
+ assert compress(arr2, 1) == expected
241
+
242
+
243
+ def test_embedding_precision():
244
+ d16 = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4)
245
+ assert d16["features"][0]["embedding"][0] == 0.0087
tests/test_main.py ADDED
@@ -0,0 +1,32 @@
1
+ import iscc_sct as sct
2
+
3
+
4
+ def test_create_returns_sct_meta():
5
+ result = sct.create("Hello World")
6
+ assert isinstance(result, sct.Metadata)
7
+
8
+
9
+ def test_create_default():
10
+ result = sct.create("Hello World")
11
+ assert result == sct.Metadata(iscc="ISCC:CAA7GZ4J3DI3XY2R", characters=11)
12
+
13
+
14
+ def test_create_granular():
15
+ result = sct.create("Hello World", granular=True)
16
+ assert result.model_dump(exclude_none=True) == {
17
+ "iscc": "ISCC:CAA7GZ4J3DI3XY2R",
18
+ "characters": 11,
19
+ "features": [
20
+ {
21
+ "maintype": "semantic",
22
+ "subtype": "text",
23
+ "version": 0,
24
+ "simprints": [{"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}],
25
+ }
26
+ ],
27
+ }
28
+
29
+
30
+ def test_create_embedding():
31
+ result = sct.create("Hello World", embedding=True)
32
+ assert len(result.features[0].embedding) == 384
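
Since `create` returns a pydantic `Metadata` model (exercised in `tests/test_models.py` below), results serialize directly to JSON. A small sketch, assuming pydantic v2's `model_dump_json`:

    import iscc_sct as sct

    meta = sct.create("Hello World", granular=True)
    # exclude_none drops the optional fields that were not requested.
    print(meta.model_dump_json(exclude_none=True, indent=2))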
tests/test_models.py ADDED
@@ -0,0 +1,102 @@
1
+ import pytest
2
+ from pydantic import ValidationError
3
+ from iscc_sct.models import Metadata, Feature, FeatureSet
4
+
5
+
6
+ def test_feature_initialization():
7
+ # Test empty initialization
8
+ with pytest.raises(ValidationError):
9
+ Feature()
10
+ feature = Feature(simprint="XZjeSfdyVi0")
11
+ assert feature.simprint == "XZjeSfdyVi0"
12
+ assert feature.offset is None
13
+ assert feature.content is None
14
+
15
+ # Test initialization with values
16
+ feature = Feature(simprint="feature", offset=5, content="example text")
17
+ assert feature.simprint == "feature"
18
+ assert feature.offset == 5
19
+ assert feature.content == "example text"
20
+
21
+
22
+ def test_feature_set_initialization():
23
+ fs = FeatureSet()
24
+ assert fs.model_dump(exclude_none=True) == {"maintype": "semantic", "subtype": "text", "version": 0}
25
+
26
+
27
+ def test_sct_meta_initialization():
28
+ # Test initialization with minimal required fields
29
+ meta = Metadata(iscc="ISCC1234567890")
30
+ assert meta.iscc == "ISCC1234567890"
31
+ assert meta.characters is None
32
+ assert meta.features is None
33
+
34
+ # Test initialization with all fields
35
+ features = [FeatureSet(simprints=[Feature(simprint="feature1", offset=0, content="text1")], embedding=[0.1, 0.2])]
36
+ meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
37
+ assert meta.iscc == "ISCC1234567890"
38
+ assert meta.characters == 1000
39
+ assert meta.features == features
40
+ assert meta.features[0].embedding == [0.1, 0.2]
41
+
42
+
43
+ def test_metadata_to_index_format():
44
+ # Test conversion from Object-Format to Index-Format
45
+ features = [
46
+ FeatureSet(
47
+ simprints=[
48
+ Feature(simprint="feature1", offset=0, size=5, content="text1"),
49
+ Feature(simprint="feature2", offset=5, size=5, content="text2"),
50
+ ]
51
+ )
52
+ ]
53
+ meta = Metadata(iscc="ISCC1234567890", features=features)
54
+ index_meta = meta.to_index_format()
55
+ assert isinstance(index_meta.features[0].simprints[0], str)
56
+ assert index_meta.features[0].simprints == ["feature1", "feature2"]
57
+ assert index_meta.features[0].offsets == [0, 5]
58
+ assert index_meta.features[0].sizes == [5, 5]
59
+ assert index_meta.features[0].contents == ["text1", "text2"]
60
+
61
+ # Test that Index-Format remains unchanged
62
+ index_meta2 = index_meta.to_index_format()
63
+ assert index_meta2.model_dump() == index_meta.model_dump()
64
+
65
+
66
+ def test_metadata_to_object_format():
67
+ # Test conversion from Index-Format to Object-Format
68
+ features = [
69
+ FeatureSet(simprints=["feature1", "feature2"], offsets=[0, 5], sizes=[5, 5], contents=["text1", "text2"])
70
+ ]
71
+ meta = Metadata(iscc="ISCC1234567890", features=features)
72
+ object_meta = meta.to_object_format()
73
+ assert isinstance(object_meta.features[0].simprints[0], Feature)
74
+ assert object_meta.features[0].simprints[0].simprint == "feature1"
75
+ assert object_meta.features[0].simprints[0].offset == 0
76
+ assert object_meta.features[0].simprints[0].size == 5
77
+ assert object_meta.features[0].simprints[0].content == "text1"
78
+ assert object_meta.features[0].offsets is None
79
+ assert object_meta.features[0].sizes is None
80
+ assert object_meta.features[0].contents is None
81
+
82
+ # Test that Object-Format remains unchanged
83
+ object_meta2 = object_meta.to_object_format()
84
+ assert object_meta2.model_dump() == object_meta.model_dump()
85
+
86
+
87
+ def test_metadata_to_index_format_with_none_simprints():
88
+ # Test conversion when feature_set.simprints is None
89
+ features = [FeatureSet(simprints=None, embedding=[0.1, 0.2])]
90
+ meta = Metadata(iscc="ISCC1234567890", features=features)
91
+ index_meta = meta.to_index_format()
92
+ assert index_meta.features[0].simprints is None
93
+ assert index_meta.features[0].embedding == [0.1, 0.2]
94
+ assert index_meta.model_dump() == meta.model_dump()
95
+
96
+
97
+ def test_metadata_format_conversion_with_no_features():
98
+ meta = Metadata(iscc="ISCC1234567890")
99
+ index_meta = meta.to_index_format()
100
+ object_meta = meta.to_object_format()
101
+ assert index_meta.model_dump() == meta.model_dump()
102
+ assert object_meta.model_dump() == meta.model_dump()
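
The conversions exercised above reshape the same data between two layouts: Object-Format keeps one `Feature` record per simprint, while Index-Format stores compact parallel arrays under the pluralized field names. An illustrative reshaping only, using plain dicts; the real conversion lives in `iscc_sct.models`:

    # Object-Format: one record per feature.
    object_format = [
        {"simprint": "feature1", "offset": 0, "size": 5, "content": "text1"},
        {"simprint": "feature2", "offset": 5, "size": 5, "content": "text2"},
    ]

    # Index-Format: parallel arrays keyed by pluralized field names.
    index_format = {
        key + "s": [record[key] for record in object_format]
        for key in ("simprint", "offset", "size", "content")
    }
    assert index_format["simprints"] == ["feature1", "feature2"]
    assert index_format["offsets"] == [0, 5]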
tests/test_readme.py ADDED
@@ -0,0 +1,11 @@
1
+ import doctest
2
+ from pathlib import Path
3
+
4
+ README = Path(__file__).parent.parent / "README.md"
5
+
6
+
7
+ def test_readme_examples():
8
+ failure_count, test_count = doctest.testfile(
9
+ README.as_posix(), module_relative=False, optionflags=doctest.ELLIPSIS, raise_on_error=False
10
+ )
11
+ assert failure_count == 0, f"{failure_count} out of {test_count} doctests failed"
tests/test_utils.py ADDED
@@ -0,0 +1,92 @@
1
+ import pytest
2
+ import iscc_sct as sct
3
+ from iscc_sct import utils
4
+ from blake3 import blake3
5
+
6
+
7
+ def test_check_integrity(tmp_path):
8
+ # Create a temporary file with known content
9
+ file_path = tmp_path / "testfile.txt"
10
+ content = "This is a test file."
11
+ with open(file_path, "w") as f:
12
+ f.write(content)
13
+
14
+ # Generate a correct checksum and then alter it to simulate failure
15
+ hasher = blake3()
16
+ hasher.update(content.encode())
17
+ correct_checksum = hasher.hexdigest()
18
+ assert utils.check_integrity(file_path, correct_checksum) == file_path
19
+
20
+ wrong_checksum = correct_checksum + "wrong" # Deliberately incorrect checksum
21
+
22
+ # Test the function with the wrong checksum
23
+ with pytest.raises(RuntimeError) as exc_info:
24
+ utils.check_integrity(file_path, wrong_checksum)
25
+
26
+ # Check that the exception message contains expected text
27
+ assert "Failed integrity check" in str(exc_info.value)
28
+
29
+
30
+ def test_hamming_distance_identical():
31
+ a = b"abc"
32
+ b = b"abc"
33
+ assert utils.hamming_distance(a, b) == 0
34
+
35
+
36
+ def test_hamming_distance_different():
37
+ a = b"abc"
38
+ b = b"abd"
39
+ assert utils.hamming_distance(a, b) == 3
40
+
41
+
42
+ def test_hamming_distance_completely_different():
43
+ a = b"\x00"
44
+ b = b"\xff"
45
+ assert utils.hamming_distance(a, b) == 8
46
+
47
+
48
+ def test_hamming_distance_raises_value_error():
49
+ a = b"abc"
50
+ b = b"abcd"
51
+ with pytest.raises(ValueError):
52
+ utils.hamming_distance(a, b)
53
+
54
+
55
+ def test_encode_decode_base32():
56
+ original = b"Hello, World!"
57
+ encoded = utils.encode_base32(original)
58
+ assert isinstance(encoded, str)
59
+ assert encoded == "JBSWY3DPFQQFO33SNRSCC"
60
+ decoded = utils.decode_base32(encoded)
61
+ assert isinstance(decoded, bytes)
62
+ assert decoded == original
63
+
64
+
65
+ def test_encode_decode_base64():
66
+ original = b"Hello, World!"
67
+ encoded = utils.encode_base64(original)
68
+ assert isinstance(encoded, str)
69
+ assert encoded == "SGVsbG8sIFdvcmxkIQ"
70
+ decoded = utils.decode_base64(encoded)
71
+ assert isinstance(decoded, bytes)
72
+ assert decoded == original
73
+
74
+
75
+ def test_encode_decode_edge_cases():
76
+ # Test empty input
77
+ assert utils.encode_base32(b"") == ""
78
+ assert utils.decode_base32("") == b""
79
+ assert utils.encode_base64(b"") == ""
80
+ assert utils.decode_base64("") == b""
81
+
82
+ # Test input with padding
83
+ original = b"a"
84
+ assert utils.decode_base32(utils.encode_base32(original)) == original
85
+ assert utils.decode_base64(utils.encode_base64(original)) == original
86
+
87
+
88
+ def test_iscc_distance_different_lengths():
89
+ iscc1 = sct.create("Hello", bits=64).iscc
90
+ iscc2 = sct.create("Hello", bits=96).iscc
91
+ with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
92
+ utils.iscc_distance(iscc1, iscc2)
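
The expected strings in these tests show that `encode_base32` and `encode_base64` produce unpadded output. A sketch of padding-free helpers consistent with those values, built on the standard library (the actual `iscc_sct.utils` implementation may differ in detail):

    import base64

    def encode_base32(data: bytes) -> str:
        return base64.b32encode(data).decode("ascii").rstrip("=")

    def decode_base32(code: str) -> bytes:
        # Restore padding to a multiple of 8 characters before decoding.
        return base64.b32decode(code + "=" * (-len(code) % 8))

    def encode_base64(data: bytes) -> str:
        return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=")

    def decode_base64(code: str) -> bytes:
        # Restore padding to a multiple of 4 characters before decoding.
        return base64.urlsafe_b64decode(code + "=" * (-len(code) % 4))

    assert encode_base32(b"Hello, World!") == "JBSWY3DPFQQFO33SNRSCC"
    assert encode_base64(b"Hello, World!") == "SGVsbG8sIFdvcmxkIQ"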
tests/visualize.py ADDED
@@ -0,0 +1,90 @@
1
+ from html import escape
2
+ import iscc_sct as sct
3
+
4
+
5
+ def generate_html(fingerprint_data):
6
+ chunks = fingerprint_data["features"][0]["simprints"]  # granular simprints per the ISCC data model
7
+
8
+ # Sort chunks by offset
9
+ chunks.sort(key=lambda x: x["offset"])
10
+
11
+ html_content = f"""
12
+ <!DOCTYPE html>
13
+ <html lang="en">
14
+ <head>
15
+ <meta charset="UTF-8">
16
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
17
+ <title>Text Fingerprint Visualization</title>
18
+ <script src="https://cdn.tailwindcss.com"></script>
19
+ </head>
20
+ <body class="bg-gray-100 p-8">
21
+ <div class="max-w-4xl mx-auto bg-white p-6 rounded-lg shadow-lg">
22
+ <h1 class="text-2xl font-bold mb-4">Text Fingerprint Visualization</h1>
23
+ <div class="text-sm mb-4">
24
+ <span class="font-semibold">ISCC:</span> {fingerprint_data['iscc']}
25
+ </div>
26
+ <div class="text-sm mb-4">
27
+ <span class="font-semibold">Characters:</span> {fingerprint_data['characters']}
28
+ </div>
29
+ <div class="relative text-base leading-relaxed whitespace-pre-wrap">
30
+ """
31
+
32
+ chunk_color = "bg-yellow-100"
33
+ overlap_color = "bg-red-100"
34
+
35
+ current_pos = 0
36
+ for i, chunk in enumerate(chunks):
37
+ start = max(chunk["offset"], current_pos)
38
+ end = chunk["offset"] + chunk["size"]
39
+
40
+ if start < end:
41
+ # Function to escape text and preserve line breaks
42
+ def escape_and_preserve_breaks(text):
43
+ return escape(text).replace("\n", "<br>")
44
+
45
+ # Non-overlapping part
46
+ html_content += f'<span class="{overlap_color}">{escape_and_preserve_breaks(chunk["text"][current_pos - chunk["offset"]:start - chunk["offset"]])}'
47
+
48
+ # Overlapping part (if any)
49
+ if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
50
+ overlap_end = chunks[i + 1]["offset"]
51
+ html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
52
+ html_content += escape_and_preserve_breaks(chunk["text"][overlap_end - chunk["offset"] :])
53
+ else:
54
+ html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
55
+
56
+ # Fingerprint badge
57
+ html_content += f'<span class="inline-block bg-gray-800 text-white text-xs px-2 py-1 rounded ml-1">{chunk["feature"]}</span>'
58
+
59
+ html_content += "</span>"
60
+
61
+ current_pos = end
62
+
63
+ html_content += """
64
+ </div>
65
+ </div>
66
+ </body>
67
+ </html>
68
+ """
69
+ return html_content
70
+
71
+
72
+ def main():
73
+ with open("../README.md", "rb") as f:
74
+ data = f.read()
75
+
76
+ text = data.decode("utf-8")
77
+
78
+ result = sct.create(text, granular=True)
79
+ print(result.model_dump())
80
+
81
+ # Generate the HTML content
82
+ html_content = generate_html(result.model_dump())
83
+
84
+ # Write the HTML content to a file
85
+ with open("readme.html", "wt", encoding="utf-8") as f:
86
+ f.write(html_content)
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
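
Note that `main()` resolves `../README.md` relative to the working directory, so the script is meant to be run from inside the `tests/` directory; it writes the highlighted visualization to `readme.html` in the current working directory.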