titusz committed on
Commit b31f748
1 Parent(s): ce4efbf

Synced repo using 'sync_with_huggingface' Github Action

.editorconfig ADDED
@@ -0,0 +1,20 @@
+ # see http://editorconfig.org
+
+ # Top-level config
+ root = true
+
+
+ # All files
+ [*]
+ charset = utf-8
+ indent_style = space
+ indent_size = 4
+ end_of_line = lf
+ insert_final_newline = true
+ trim_trailing_whitespace = true
+ max_line_length = 119
+
+
+ # YAML files
+ [*.{yml,yaml}]
+ indent_size = 2
CHANGELOG.md ADDED
@@ -0,0 +1,15 @@
+ # Changelog
+
+ ## [0.1.2] - Unreleased
+ - Encode granular features with base64
+ - Refactor result format to generic ISCC data model
+ - Add optional gradio GUI demo
+
+ ## [0.1.1] - 2024-06-25
+ - Handle text decoding errors gracefully
+ - Handle feature bit-lengths independently
+ - Improve model load time
+ - Improve memory use with batched embedding
+
+ ## [0.1.0] - 2024-06-25
+ - Initial pre-release
LICENSE ADDED
@@ -0,0 +1,10 @@
+ # LICENSE
+
+ This work is licensed under the Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.
+
+ To view a copy of this license, visit https://creativecommons.org/licenses/by-nc-sa/4.0/ or send a letter to
+ Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+
+ ## Attribution
+
+ Titusz Pan, ISCC Foundation - 2024
iscc_sct/__init__.py ADDED
@@ -0,0 +1,6 @@
+ __version__ = "0.1.2"
+ from iscc_sct.options import *
+ from iscc_sct.utils import *
+ from iscc_sct.code_semantic_text import *
+ from iscc_sct.models import *
+ from iscc_sct.main import *
iscc_sct/cli.py ADDED
@@ -0,0 +1,51 @@
+ import argparse
+ import glob
+ from pathlib import Path
+ from loguru import logger
+ from iscc_sct.main import create
+ from charset_normalizer import from_bytes
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
+     parser.add_argument("path", type=str, help="Path to text files (supports glob patterns).", nargs="?")
+     parser.add_argument("-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)")
+     parser.add_argument("-g", "--granular", action="store_true", help="Activate granular processing.")
+     parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
+     args = parser.parse_args()
+
+     if args.path is None:
+         parser.print_help()
+         return
+
+     if not args.debug:
+         logger.remove()
+
+     for path in glob.glob(args.path):
+         path = Path(path)
+         if path.is_file():
+             logger.debug(f"Processing {path.name}")
+             with path.open("rb") as file:
+                 data = file.read()
+             try:
+                 text = data.decode("utf-8")
+                 if not text.strip():
+                     logger.warning(f"SKIPPED empty: {path}")
+                     continue
+             except UnicodeDecodeError:
+                 logger.debug(f"Could not decode {path.name} as UTF-8.")
+                 charset_match = from_bytes(data).best()
+                 if not charset_match:  # pragma: no cover
+                     logger.error(f"SKIPPING {path.name} - failed to detect text encoding")
+                     continue
+                 logger.debug(f"Decode {path.name} with {charset_match.encoding}.")
+                 text = str(charset_match)
+             sct_meta = create(text, granular=args.granular, bits=args.bits)
+             if args.granular:
+                 print(repr(sct_meta))
+             else:
+                 print(sct_meta.iscc)
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     main()
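For readers who prefer the library over the `sct` console script, the CLI loop above maps to a few lines of Python. A minimal sketch of roughly equivalent behavior, minus the charset-detection fallback (the `docs/*.txt` glob is a hypothetical example path):

```python
import glob
from pathlib import Path
import iscc_sct as sct

# Roughly what `sct "docs/*.txt" -b 128 -g` does, without the encoding fallback.
for p in glob.glob("docs/*.txt"):  # hypothetical input folder
    text = Path(p).read_text(encoding="utf-8")
    if text.strip():  # skip empty files, as the CLI does
        print(p, sct.create(text, granular=True, bits=128).iscc)
```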
iscc_sct/code_semantic_text.py ADDED
@@ -0,0 +1,338 @@
+ # -*- coding: utf-8 -*-
+ """*A cross-lingual semantic similarity preserving hash for plain-text content (soft hash).*
+
+ The ISCC Text-Code Semantic is a content-based compact binary code generated from multilingual text.
+
+ !!! Warning
+
+     This is a non-standard Proof of Concept implementation.
+     Plain-text extraction from documents in various formats (especially PDF) may
+     yield different results depending on the extraction tools being used.
+     The [iscc-sdk](https://github.com/iscc/iscc-sdk) uses [Apache Tika](https://tika.apache.org)
+     to extract text from documents for Text-Code generation.
+
+ **Algorithm overview**
+
+ - Split text into semantically coherent overlapping chunks.
+ - Create vector embeddings of the chunks.
+ - Average and binarize the chunk embeddings.
+ - Encode as ISCC-UNIT of MainType SEMANTIC and SubType TEXT.
+ """
+
+ from loguru import logger as log
+ from onnxruntime.capi.onnxruntime_pybind11_state import NoSuchFile
+ from semantic_text_splitter import TextSplitter
+ from tokenizers import Tokenizer
+ from pathlib import Path
+ from typing import Any
+ import numpy as np
+ import onnxruntime as rt
+ from numpy.typing import NDArray
+ from functools import cache
+ import iscc_sct as sct
+
+
+ HERE = Path(__file__).parent.absolute()
+
+
+ __all__ = [
+     "code_text_semantic",
+     "gen_text_code_semantic",
+     "soft_hash_text_semantic",
+     "embed_chunks",
+ ]
+
+ BIT_LEN_MAP = {
+     32: "0000",
+     64: "0001",
+     96: "0010",
+     128: "0011",
+     160: "0100",
+     192: "0101",
+     224: "0110",
+     256: "0111",
+ }
+
+
+ TOKENIZER_PATH = HERE / "tokenizer.json"
+ MAINTYPE = "0001"  # SEMANTIC
+ SUBTYPE = "0000"  # TEXT
+ SCT_VERSION = "0000"  # V0
+
+
+ def code_text_semantic(fp, **options):
+     # type: (Path|str, Any) -> dict[str, Any]
+     """
+     Generate ISCC Semantic-Code Text from a text file.
+
+     NOTE:
+         If you enable granular simprints with `simprints=True`, their bit-length is controlled
+         by the separate `bits_granular` option (default 64).
+
+     :param fp: File path of plaintext file to process
+     :param options: Custom processing options for overriding global options
+     :key bits (int): Length of generated Semantic Text-Code in bits (default 64)
+     :key characters (bool): Return document character count (default True).
+     :key embedding (bool): Return global document embedding (default False).
+     :key precision (int): Max fractional digits for embeddings (default 8).
+     :key simprints (bool): Return granular document simprints (default False).
+     :key offsets (bool): Return character offsets for granular features (default False).
+     :key sizes (bool): Return sizes of text chunks (default False).
+     :key contents (bool): Return text chunks (default False).
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: Dict with ISCC processing results
+     """
+     fp = Path(fp)
+     return gen_text_code_semantic(fp.read_text(encoding="utf-8"), **options)
+
+
+ def gen_text_code_semantic(text, **options):
+     # type: (str, Any) -> dict
+     """
+     Create an ISCC Semantic-Code Text from plaintext.
+
+     :param str text: Plain text for ISCC processing
+     :param options: Custom processing options for overriding global options
+     :key bits (int): Length of generated Semantic Text-Code in bits (default 64)
+     :key characters (bool): Return document character count (default True).
+     :key embedding (bool): Return global document embedding (default False).
+     :key precision (int): Max fractional digits for embeddings (default 8).
+     :key simprints (bool): Return granular document simprints (default False).
+     :key offsets (bool): Return character offsets for granular features (default False).
+     :key sizes (bool): Return sizes of text chunks (default False).
+     :key contents (bool): Return text chunks (default False).
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: Dict with ISCC processing results (using Index-Format for granular features)
+     """
+
+     if not text:
+         raise ValueError("Input text cannot be empty.")
+
+     opts = sct.sct_opts.override(options)
+
+     result = {"iscc": None}  # Initialize first so `iscc` key is "first" in dict
+
+     if opts.characters:
+         result["characters"] = len(text)
+
+     # Text splitting
+     splits = split_text(text, **opts.model_dump())
+     offsets, chunks = [list(item) for item in zip(*splits)]
+
+     # Chunk embedding
+     with sct.timer("EMBEDDING time"):
+         embeddings = embed_chunks(chunks)
+
+     # Create global document embedding
+     embedding = mean_pooling(embeddings)
+
+     if any([opts.simprints, opts.offsets, opts.sizes, opts.contents, opts.embedding]):
+         feature_set = {
+             "maintype": "semantic",
+             "subtype": "text",
+             "version": 0,
+         }
+         if opts.embedding:
+             feature_set["embedding"] = compress(embedding, opts.precision)
+         if opts.simprints:
+             feature_digests = [binarize(vec)[: opts.bits_granular // 8] for vec in embeddings]
+             feature_set["simprints"] = [sct.encode_base64(digest) for digest in feature_digests]
+         if opts.offsets:
+             feature_set["offsets"] = offsets
+         if opts.sizes:
+             feature_set["sizes"] = [len(chunk) for chunk in chunks]
+         if opts.contents:
+             feature_set["contents"] = chunks
+         result["features"] = [feature_set]
+
+     # Encode global document embedding
+     length = BIT_LEN_MAP[opts.bits]
+     header = int(MAINTYPE + SUBTYPE + SCT_VERSION + length, 2).to_bytes(2, byteorder="big")
+     digest = binarize(embedding)[: opts.bits // 8]
+     code = sct.encode_base32(header + digest)
+     result["iscc"] = "ISCC:" + code
+     return result
+
+
+ def soft_hash_text_semantic(text):
+     # type: (str) -> bytes
+     """Creates a semantic similarity preserving hash digest for text input."""
+     chunks = [item[1] for item in split_text(text)]
+     embeddings = embed_chunks(chunks)
+     embedding = mean_pooling(embeddings)
+     digest = binarize(embedding)
+     return digest
+
+
+ def split_text(text, **options):
+     # type: (str, Any) -> list[tuple[int, str]]
+     """
+     Split text into semantically coherent chunks for embedding.
+
+     :param text: Text to split.
+     :param options: Custom processing options for overriding global options
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: A list of offset, chunk tuples [(offset, chunk), ...]
+     """
+     opts = sct.sct_opts.override(options)
+     return splitter(**opts.model_dump()).chunk_indices(text)
+
+
+ @cache
+ def tokenizer():
+     # type: () -> Tokenizer
+     """
+     Load and cache the tokenizer model based on the predefined model name.
+
+     :return: An instance of the Tokenizer.
+     """
+     with sct.timer("TOKENIZER load time"):
+         return Tokenizer.from_file(TOKENIZER_PATH.as_posix())
+
+
+ @cache
+ def splitter(**options):
+     # type: (Any) -> TextSplitter
+     """
+     Load and cache the text splitter, initialized with the tokenizer.
+
+     :param options: Custom processing options for overriding global options
+     :key max_tokens (int): Max tokens per chunk (default 127).
+     :key overlap (int): Max tokens allowed to overlap between chunks (default 48).
+     :key trim (bool): Trim whitespace from chunks (default False).
+     :return: An instance of TextSplitter.
+     """
+     opts = sct.sct_opts.override(options)
+     with sct.timer("TEXTSPLITTER load time"):
+         return TextSplitter.from_huggingface_tokenizer(
+             tokenizer(), capacity=opts.max_tokens, overlap=opts.overlap, trim=opts.trim
+         )
+
+
+ @cache
+ def model():
+     # type: () -> rt.InferenceSession
+     """
+     Load and cache the ONNX inference model from a specified path.
+
+     :return: An ONNX inference session.
+     """
+     available_onnx_providers = rt.get_available_providers()
+     log.debug(f"Available ONNX providers {', '.join(available_onnx_providers)}")
+     selected_onnx_providers = ["CPUExecutionProvider"]
+     if "CUDAExecutionProvider" in available_onnx_providers:  # pragma: no cover
+         selected_onnx_providers.insert(0, "CUDAExecutionProvider")
+     log.debug(f"Using ONNX providers {', '.join(selected_onnx_providers)}")
+     so = rt.SessionOptions()
+     so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
+     try:
+         with sct.timer("ONNXMODEL load time"):
+             return rt.InferenceSession(sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers)
+     except NoSuchFile:  # pragma: no cover
+         with sct.timer("ONNXMODEL acquisition/load time"):
+             model_path = sct.get_model()
+             return rt.InferenceSession(model_path, sess_options=so, providers=selected_onnx_providers)
+
+
+ def tokenize_chunks(chunks):
+     # type: (list[str]) -> dict
+     """
+     Tokenize text chunks into model-compatible formats.
+
+     :param chunks: Text chunks to tokenize.
+     :return: Dictionary of tokenized data including input IDs, attention masks, and type IDs.
+     """
+     encodings = tokenizer().encode_batch(chunks)
+     input_ids = np.array([encoding.ids for encoding in encodings], dtype=np.int64)
+     attention_mask = np.array([encoding.attention_mask for encoding in encodings], dtype=np.int64)
+     type_ids = np.array([encoding.type_ids for encoding in encodings], dtype=np.int64)
+     return {"input_ids": input_ids, "attention_mask": attention_mask, "token_type_ids": type_ids}
+
+
+ def embed_chunks(chunks, batch_size=100):
+     """
+     Embed text chunks and return vector embeddings.
+
+     :param chunks: Text chunks to embed.
+     :param batch_size: Number of chunks to process in each batch.
+     :return: An array of embeddings for each chunk.
+     """
+     embeddings = []
+     for start_idx in range(0, len(chunks), batch_size):
+         batch_chunks = chunks[start_idx : start_idx + batch_size]
+         tokens = tokenize_chunks(batch_chunks)
+         token_embeddings = embed_tokens(tokens)
+         batch_embeddings = attention_pooling(token_embeddings, tokens["attention_mask"])
+         embeddings.append(batch_embeddings)
+     return np.vstack(embeddings)
+
+
+ def embed_tokens(tokens):
+     # type: (dict) -> NDArray
+     """
+     Create embeddings from tokenized text chunks using the model.
+
+     :param tokens: Tokenized text data.
+     :return: An array of embeddings.
+     """
+     result = model().run(None, tokens)
+     return np.array(result[0])
+
+
+ def attention_pooling(token_embeddings, attention_mask):
+     # type: (np.array, np.array) -> np.array
+     """
+     Apply attention mask based mean pooling to the token embeddings.
+
+     :param token_embeddings: Raw token embeddings from the model.
+     :param attention_mask: Attention masks for the embeddings.
+     :return: An array of pooled and normalized embeddings.
+     """
+     input_mask_expanded = attention_mask[:, :, None].astype(np.float32)
+     sum_embeddings = np.sum(token_embeddings * input_mask_expanded, axis=1)
+     sum_mask = np.clip(np.sum(input_mask_expanded, axis=1), a_min=1e-9, a_max=None)
+     mean_pooled = sum_embeddings / sum_mask
+     norm = np.linalg.norm(mean_pooled, ord=2, axis=1, keepdims=True)
+     result = mean_pooled / np.clip(norm, a_min=1e-9, a_max=None)
+     return result.astype(np.float32)
+
+
+ def mean_pooling(embeddings):
+     # type: (NDArray[np.float32]) -> NDArray
+     """
+     Calculate the document vector from chunk embeddings using mean pooling.
+
+     :param embeddings: Chunk embeddings.
+     :return: A normalized document vector.
+     """
+     document_vector = embeddings.mean(axis=0)
+     return document_vector / np.linalg.norm(document_vector)
+
+
+ def binarize(vec):
+     # type: (NDArray) -> bytes
+     """
+     Binarize an embedding vector into a hash digest.
+
+     :param vec: Vector to be binarized.
+     :return: A bytes object representing the binary hash.
+     """
+     return bytes(np.packbits(np.array(vec) >= 0))
+
+
+ def compress(vec, precision):
+     # type: (NDArray, int) -> list[float]
+     """
+     Round down vector values to specified precision to reduce storage requirements.
+
+     :param vec: Embedding vector.
+     :param precision: Max number of fractional decimal places.
+     :return: Vector as native Python list of rounded floats.
+     """
+     rounded_array = np.around(vec, decimals=precision)
+     compress_list = [round(x, precision) for x in rounded_array.tolist()]
+     return compress_list
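The module above wires the whole pipeline (split, embed, pool, binarize, encode) behind two entry points. A minimal usage sketch, assuming the package is installed and the ONNX model is available locally or downloadable:

```python
import iscc_sct as sct

# Default options: 64-bit code plus character count in the result dict.
result = sct.gen_text_code_semantic(
    "This is a sample text in English to demonstrate the ISCC-CODE generation.",
    bits=64,
)
print(result["iscc"])        # e.g. "ISCC:CAA..." (digest depends on the model)
print(result["characters"])  # character count of the input text

# The raw similarity-preserving digest behind the code:
digest = sct.soft_hash_text_semantic("Some other text")
print(digest.hex())
```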
iscc_sct/demo.py ADDED
@@ -0,0 +1,153 @@
+ """
+ Gradio demo showcasing the ISCC Semantic Text-Code.
+
+ The demo features:
+
+ - Two side-by-side text inputs.
+ - One sample text per input (one sample in English, the other a German translation of it).
+ - One slider to set the global bit-length (32-256 bits in steps of 32, with 64 as default).
+ - One result output per text input.
+
+ The user can select the samples or write or paste text into the inputs and generate ISCC Semantic
+ Text-Codes for the texts. Below the result outputs we show the similarity of the two codes.
+ """
+
+ from loguru import logger as log
+ import gradio as gr
+ import iscc_sct as sct
+
+
+ def compute_iscc_code(text1, text2, bit_length):
+     code1 = sct.gen_text_code_semantic(text1, bits=bit_length)
+     code2 = sct.gen_text_code_semantic(text2, bits=bit_length)
+     similarity = compare_codes(code1["iscc"], code2["iscc"], bit_length)
+     return code1["iscc"], code2["iscc"], similarity
+
+
+ def compare_codes(code_a, code_b, bits):
+     if all([code_a, code_b]):
+         return generate_similarity_bar(hamming_to_cosine(sct.iscc_distance(code_a, code_b), bits))
+
+
+ def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
+     """Approximate the cosine similarity for a given hamming distance and dimension."""
+     result = 1 - (2 * hamming_distance) / dim
+     return result
+
+
+ def generate_similarity_bar(similarity):
+     """Generate a horizontal bar representing the similarity value, scaled to -100% to +100%."""
+     # Scale similarity from [-1, 1] to [-100, 100]
+     display_similarity = similarity * 100
+
+     # Calculate the width of the bar based on the absolute value of similarity
+     bar_width = int(abs(similarity) * 50)  # 50% is half the width of the container
+
+     # Determine the color and starting position based on the sign of the similarity
+     color = "green" if similarity >= 0 else "red"
+     position = "left" if similarity >= 0 else "right"
+
+     # Adjust the text position to be centered within the colored bar
+     text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
+     text_alignment = "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
+
+     bar_html = f"""
+     <h3>Semantic Similarity</h3>
+     <div style='width: 100%; border: 1px solid #ccc; height: 30px; position: relative; background-color: #eee;'>
+         <div style='height: 100%; width: {bar_width}%; background-color: {color}; position: absolute; {position}: 50%;'>
+             <span style='position: absolute; width: 100%; {text_position} top: 0; line-height: 30px; color: white; {text_alignment}'>{display_similarity:.2f}%</span>
+         </div>
+     </div>
+     """
+     return bar_html
+
+
+ # Sample texts
+ sample_text_en = "This is a sample text in English to demonstrate the ISCC-CODE generation."
+ sample_text_de = "Dies ist ein Beispieltext auf Deutsch, um die Erzeugung von ISCC-CODES zu demonstrieren."
+
+ custom_css = """
+ #chunked-text span.label {
+     text-transform: none !important;
+ }
+ """
+
+ iscc_theme = gr.themes.Default(
+     font=[gr.themes.GoogleFont("Readex Pro")],
+     font_mono=[gr.themes.GoogleFont("JetBrains Mono")],
+     radius_size=gr.themes.sizes.radius_none,
+ )
+
+ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
+     with gr.Row(variant="panel"):
+         gr.Markdown(
+             """
+             ## ✂️ ISCC Semantic Text-Code
+             Demo of cross-lingual Semantic Text-Code (proof of concept)
+             """,
+         )
+     with gr.Row(variant="panel"):
+         in_iscc_bits = gr.Slider(
+             label="ISCC Bit-Length",
+             info="NUMBER OF BITS FOR OUTPUT ISCC",
+             minimum=64,
+             maximum=256,
+             step=32,
+             value=64,
+         )
+     with gr.Row(variant="panel"):
+         with gr.Column(variant="panel"):
+             in_text_a = gr.TextArea(
+                 label="Text",
+                 placeholder="Paste your text here or select sample from below",
+                 lines=12,
+                 max_lines=12,
+             )
+
+             gr.Examples(label="Sample Text", examples=[sample_text_en], inputs=[in_text_a])
+             out_code_a = gr.Textbox(label="ISCC Code for Text A")
+         with gr.Column(variant="panel"):
+             in_text_b = gr.TextArea(
+                 label="Text",
+                 placeholder="Paste your text here or select sample from below",
+                 lines=12,
+                 max_lines=12,
+             )
+
+             gr.Examples(label="Sample Text", examples=[sample_text_de], inputs=[in_text_b])
+             out_code_b = gr.Textbox(label="ISCC Code for Text B")
+
+     with gr.Row(variant="panel"):
+         with gr.Column(variant="panel"):
+             out_similarity = gr.HTML(label="Similarity")
+
+     def process_text(text, nbits, suffix):
+         log.debug(f"{text[:20]}")
+         if not text:
+             return
+         out_code_func = globals().get(f"out_code_{suffix}")
+         iscc = sct.Metadata(**sct.gen_text_code_semantic(text, bits=nbits))
+         result = {out_code_func: gr.Textbox(value=iscc.iscc)}
+         return result
+
+     in_text_a.change(
+         lambda text, nbits: process_text(text, nbits, "a"),
+         inputs=[in_text_a, in_iscc_bits],
+         outputs=[out_code_a],
+         show_progress="full",
+     )
+     in_text_b.change(
+         lambda text, nbits: process_text(text, nbits, "b"),
+         inputs=[in_text_b, in_iscc_bits],
+         outputs=[out_code_b],
+         show_progress="full",
+     )
+
+     out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
+     out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
+     with gr.Row():
+         gr.ClearButton(components=[in_text_a, in_text_b])
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     demo.launch()
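The demo's similarity read-out rests on the `hamming_to_cosine` approximation above: for bit-vectors, cosine similarity is roughly 1 - 2d/n for Hamming distance d over n bits. A standalone worked check (restating the helper so the gradio app need not be imported):

```python
def hamming_to_cosine(hamming_distance: int, dim: int) -> float:
    return 1 - (2 * hamming_distance) / dim

assert hamming_to_cosine(0, 64) == 1.0    # identical codes
assert hamming_to_cosine(32, 64) == 0.0   # half the bits differ: orthogonal
assert hamming_to_cosine(64, 64) == -1.0  # all bits differ
```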
iscc_sct/main.py ADDED
@@ -0,0 +1,32 @@
+ from iscc_sct.models import Metadata
+ from iscc_sct.code_semantic_text import gen_text_code_semantic
+ from iscc_sct.options import sct_opts
+
+
+ __all__ = [
+     "create",
+ ]
+
+
+ def create(text, granular=False, **options):
+     # type: (str, bool, Any) -> Metadata
+     """
+     Create Semantic Text-Code
+
+     High-Level API for creating Semantic Text-Code.
+
+     :param text: Text used for creating Semantic Text-Code.
+     :param granular: Activate options for granular processing (Default: False).
+     :param options: Override individual options for creating Semantic Text-Code.
+     :return: Semantic Text-Code `Metadata` object in Object-Format
+     """
+
+     # Override global options with individual options derived from the `granular` parameter
+     granular = dict(simprints=True, offsets=True, sizes=True, contents=True) if granular else {}
+     opts = sct_opts.override(granular)
+
+     # Override local options with individual options from additional keyword arguments
+     opts = opts.override(options)
+
+     data = gen_text_code_semantic(text, **opts.model_dump())
+     return Metadata(**data).to_object_format()
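A short sketch of this high-level API in use, assuming the model is available:

```python
import iscc_sct as sct

# Object-Format result with granular simprints, offsets, sizes and contents.
meta = sct.create("Hello cross-lingual world!", granular=True, bits=64)
print(meta.iscc)
for feature_set in meta.features or []:
    for feature in feature_set.simprints:
        print(feature.simprint, feature.offset, feature.size)
```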
iscc_sct/models.py ADDED
@@ -0,0 +1,176 @@
+ """
+ # Semantic-Code Text - Datamodel
+
+ This module provides the pydantic metadata schema for Semantic Text-Code results.
+ The schema is conformant with https://schema.iscc.codes/
+
+ The `features` property of the top-level Metadata Object supports two different formats for
+ representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**.
+ These formats are designed to offer flexibility in how feature data is structured and processed,
+ catering to different use cases where either performance or clarity is prioritized.
+
+ ## Features Index-Format (Compact Array Structure):
+
+ In this compact format, features are represented as a list of strings, with optional parallel arrays to
+ store related attributes such as `offsets`, `sizes`, and `contents`.
+
+ **Example**:
+
+ ```json
+ {
+     "maintype": "semantic",
+     "subtype": "text",
+     "version": 0,
+     "simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"],
+     "offsets": [0, 12],
+     "sizes": [12, 48],
+     "contents": ["textchunk no one", "textchunk no two"]
+ }
+ ```
+
+ **Use Case**:
+ - Best suited for scenarios where storage efficiency is critical, and the overhead of processing
+   multiple parallel arrays is acceptable.
+ - Useful when all features share the same set of attributes, allowing for faster bulk processing.
+
+ ## Features Object-Format (Self-Descriptive Object Structure):
+
+ In this convenient format, each feature is represented as an individual object containing its
+ attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but
+ easier to read and work with.
+
+ **Example**:
+
+ ```json
+ {
+     "maintype": "content",
+     "subtype": "text",
+     "version": 0,
+     "simprints": [
+         {
+             "simprint": "lUjuScFYBik",
+             "offset": 0,
+             "size": 25,
+             "content": "ISCC - Semantic Text-Code"
+         }
+     ]
+ }
+ ```
+
+ **Use Case**:
+ - Ideal for scenarios where clarity and readability are prioritized.
+ - Each feature is self-contained, making it easier to understand, extend, and debug.
+ - Flexibility in including or omitting optional attributes per feature.
+
+
+ ### Unified FeatureSet Schema:
+
+ The `FeatureSet` model unifies these two formats by allowing either structure to be used.
+ To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
+ """
+
+ from typing import List, Optional, Union
+ from pydantic import BaseModel
+
+
+ __all__ = ["Feature", "FeatureSet", "Metadata"]
+
+
+ class PrettyBaseModel(BaseModel):
+     def __repr__(self):
+         return self.pretty_repr()
+
+     def pretty_repr(self):
+         return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False)
+
+
+ class Feature(PrettyBaseModel):
+     simprint: str
+     offset: Optional[int] = None
+     size: Optional[int] = None
+     content: Optional[str] = None
+
+
+ class FeatureSet(PrettyBaseModel):
+     maintype: str = "semantic"
+     subtype: str = "text"
+     version: int = 0
+     embedding: Optional[List[float]] = None
+     simprints: Optional[
+         Union[
+             List[str],  # Index-Format
+             List[Feature],  # Object-Format
+         ]
+     ] = None
+     offsets: Optional[List[int]] = None
+     sizes: Optional[List[int]] = None
+     contents: Optional[List[str]] = None
+
+
+ class Metadata(PrettyBaseModel):
+     iscc: str
+     characters: Optional[int] = None
+     features: Optional[List[FeatureSet]] = None
+
+     def to_index_format(self) -> "Metadata":
+         """
+         Convert the Metadata object to use the Index-Format for features.
+         Returns a new Metadata object.
+         """
+         if not self.features:
+             return self.model_copy()
+
+         new_features = []
+         for feature_set in self.features:
+             new_feature_set = feature_set.model_copy()
+             if feature_set.simprints is None:
+                 new_features.append(new_feature_set)
+                 continue
+
+             if isinstance(feature_set.simprints[0], str):
+                 new_features.append(new_feature_set)
+             else:
+                 new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
+                 new_feature_set.offsets = [f.offset for f in feature_set.simprints if f.offset is not None]
+                 new_feature_set.sizes = [f.size for f in feature_set.simprints if f.size is not None]
+                 new_feature_set.contents = [f.content for f in feature_set.simprints if f.content is not None]
+                 new_features.append(new_feature_set)
+
+         return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
+
+     def to_object_format(self) -> "Metadata":
+         """
+         Convert the Metadata object to use the Object-Format for features.
+         Returns a new Metadata object.
+         """
+         if not self.features:
+             return self.model_copy()
+
+         new_features = []
+         for feature_set in self.features:
+             new_feature_set = feature_set.model_copy()
+             if feature_set.simprints is None:
+                 new_features.append(new_feature_set)
+                 continue
+
+             if isinstance(feature_set.simprints[0], Feature):
+                 new_features.append(new_feature_set)
+             else:
+                 new_simprints = []
+                 for i, simprint in enumerate(feature_set.simprints):
+                     feature = Feature(simprint=simprint)
+                     if feature_set.offsets and i < len(feature_set.offsets):
+                         feature.offset = feature_set.offsets[i]
+                     if feature_set.sizes and i < len(feature_set.sizes):
+                         feature.size = feature_set.sizes[i]
+                     if feature_set.contents and i < len(feature_set.contents):
+                         feature.content = feature_set.contents[i]
+                     new_simprints.append(feature)
+                 new_feature_set.simprints = new_simprints
+                 new_feature_set.offsets = None
+                 new_feature_set.sizes = None
+                 new_feature_set.contents = None
+                 new_features.append(new_feature_set)
+
+         return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
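A sketch of round-tripping between the two formats with the models above (the code string and simprint values are placeholders for illustration):

```python
from iscc_sct.models import Metadata

meta = Metadata(
    iscc="ISCC:CAA0000000000000",  # placeholder code string
    features=[{
        "maintype": "semantic",
        "subtype": "text",
        "version": 0,
        "simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"],
        "offsets": [0, 12],
        "sizes": [12, 48],
    }],
)

obj = meta.to_object_format()  # parallel arrays -> list of Feature objects
print(obj.features[0].simprints[0].offset)  # 0
idx = obj.to_index_format()  # and back to parallel arrays
assert idx.features[0].simprints == meta.features[0].simprints
```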
iscc_sct/options.py ADDED
@@ -0,0 +1,78 @@
+ from dotenv import load_dotenv
+ from pydantic import Field
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ __all__ = [
+     "SctOptions",
+     "sct_opts",
+ ]
+
+
+ load_dotenv()
+
+
+ class SctOptions(BaseSettings):
+     bits: int = Field(
+         64,
+         description="ISCC_SCT_BITS - Default bit-length of generated Semantic Text-Code in bits",
+         ge=32,
+         le=256,
+         multiple_of=32,
+     )
+
+     bits_granular: int = Field(
+         64,
+         description="ISCC_SCT_BITS_GRANULAR - Default bit-length of granular features",
+         ge=32,
+         le=256,
+         multiple_of=32,
+     )
+
+     characters: bool = Field(True, description="ISCC_SCT_CHARACTERS - Include document character count")
+     embedding: bool = Field(False, description="ISCC_SCT_EMBEDDING - Include global document embedding")
+
+     precision: int = Field(8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)")
+
+     simprints: bool = Field(False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints")
+     offsets: bool = Field(False, description="ISCC_SCT_OFFSETS - Include offsets of granular features")
+
+     sizes: bool = Field(False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)")
+
+     contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")
+
+     max_tokens: int = Field(
+         127,
+         description="ISCC_SCT_MAX_TOKENS - Max tokens per chunk (Default 127)",
+         le=127,
+     )
+
+     overlap: int = Field(
+         48,
+         description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
+     )
+
+     trim: bool = Field(False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)")
+
+     model_config = SettingsConfigDict(
+         env_file=".env",
+         env_file_encoding="utf-8",
+         env_prefix="ISCC_SCT_",
+         extra="ignore",
+         validate_assignment=True,
+     )
+
+     def override(self, update=None):
+         # type: (dict|None) -> SctOptions
+         """Returns an updated and validated deep copy of the current settings instance."""
+
+         update = update or {}  # sets {} if update is None
+
+         opts = self.model_copy(deep=True)
+         # We need to update fields individually so validation gets triggered
+         for field, value in update.items():
+             setattr(opts, field, value)
+         return opts
+
+
+ sct_opts = SctOptions()
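All options can also be set via `ISCC_SCT_*` environment variables or a `.env` file; per-call overrides go through `override()`. A minimal sketch (assuming no `ISCC_SCT_BITS` environment override is in effect):

```python
from iscc_sct.options import sct_opts

# Per-call override returns a validated copy; the global instance stays untouched.
opts = sct_opts.override({"bits": 128, "trim": True})
assert opts.bits == 128 and sct_opts.bits == 64

# validate_assignment=True means invalid values raise immediately
# (pydantic's ValidationError is a ValueError subclass).
try:
    sct_opts.override({"bits": 100})  # not a multiple of 32
except ValueError as e:
    print("rejected:", e)
```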
iscc_sct/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
iscc_sct/utils.py ADDED
@@ -0,0 +1,178 @@
+ import math
+ from base64 import b32encode, b32decode
+ from pybase64 import urlsafe_b64encode, urlsafe_b64decode
+ from loguru import logger as log
+ import os
+ import time
+ from pathlib import Path
+ from urllib.request import urlretrieve
+ from blake3 import blake3
+ from platformdirs import PlatformDirs
+
+
+ APP_NAME = "iscc-sct"
+ APP_AUTHOR = "iscc"
+ dirs = PlatformDirs(appname=APP_NAME, appauthor=APP_AUTHOR)
+ os.makedirs(dirs.user_data_dir, exist_ok=True)
+
+
+ __all__ = [
+     "timer",
+     "get_model",
+     "encode_base32",
+     "encode_base64",
+     "hamming_distance",
+     "iscc_distance",
+     "MODEL_PATH",
+ ]
+
+
+ BASE_VERSION = "1.0.0"
+ BASE_URL = f"https://github.com/iscc/iscc-binaries/releases/download/v{BASE_VERSION}"
+ MODEL_FILENAME = "iscc-sct-v0.1.0.onnx"
+ MODEL_URL = f"{BASE_URL}/{MODEL_FILENAME}"
+ MODEL_PATH = Path(dirs.user_data_dir) / MODEL_FILENAME
+ MODEL_CHECKSUM = "ff254d62db55ed88a1451b323a66416f60838dd2f0338dba21bc3b8822459abc"
+
+
+ class timer:
+     def __init__(self, message: str):
+         self.message = message
+
+     def __enter__(self):
+         # Record the start time
+         self.start_time = time.perf_counter()
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         # Calculate the elapsed time
+         elapsed_time = time.perf_counter() - self.start_time
+         # Log the message with the elapsed time
+         log.debug(f"{self.message} {elapsed_time:.4f} seconds")
+
+
+ def get_model():  # pragma: no cover
+     """Check and return local model file if it exists, otherwise download."""
+     if MODEL_PATH.exists():
+         try:
+             return check_integrity(MODEL_PATH, MODEL_CHECKSUM)
+         except RuntimeError:
+             log.warning("Model file integrity error - redownloading ...")
+             urlretrieve(MODEL_URL, filename=MODEL_PATH)
+     else:
+         log.info("Downloading embedding model ...")
+         urlretrieve(MODEL_URL, filename=MODEL_PATH)
+     return check_integrity(MODEL_PATH, MODEL_CHECKSUM)
+
+
+ def check_integrity(file_path, checksum):
+     # type: (str|Path, str) -> Path
+     """
+     Check file integrity against blake3 checksum
+
+     :param file_path: path to file to be checked
+     :param checksum: blake3 checksum to verify integrity
+     :raises RuntimeError: if verification fails
+     """
+     file_path = Path(file_path)
+     file_hasher = blake3(max_threads=blake3.AUTO)
+     with timer("INTEGRITY check time"):
+         file_hasher.update_mmap(file_path)
+         file_hash = file_hasher.hexdigest()
+     if checksum != file_hash:
+         msg = f"Failed integrity check for {file_path.name}"
+         log.error(msg)
+         raise RuntimeError(msg)
+     return file_path
+
+
+ def encode_base32(data):
+     # type: (bytes) -> str
+     """
+     Standard RFC4648 base32 encoding without padding.
+
+     :param bytes data: Data for base32 encoding
+     :return: Base32 encoded str
+     """
+     return b32encode(data).decode("ascii").rstrip("=")
+
+
+ def decode_base32(code):
+     # type: (str) -> bytes
+     """
+     Standard RFC4648 base32 decoding without padding and with casefolding.
+     """
+     # python stdlib does not support base32 without padding, so we have to re-pad.
+     cl = len(code)
+     pad_length = math.ceil(cl / 8) * 8 - cl
+
+     return bytes(b32decode(code + "=" * pad_length, casefold=True))
+
+
+ def encode_base64(data):
+     # type: (bytes) -> str
+     """
+     Standard RFC4648 base64url encoding without padding.
+     """
+     code = urlsafe_b64encode(data).decode("ascii")
+     return code.rstrip("=")
+
+
+ def decode_base64(code):
+     # type: (str) -> bytes
+     """
+     Standard RFC4648 base64url decoding without padding.
+     """
+     padding = 4 - (len(code) % 4)
+     string = code + ("=" * padding)
+     return urlsafe_b64decode(string)
+
+
+ def hamming_distance(a, b):
+     # type: (bytes, bytes) -> int
+     """
+     Calculate the bitwise Hamming distance between two bytes objects.
+
+     :param a: The first bytes object.
+     :param b: The second bytes object.
+     :return: The Hamming distance between two bytes objects.
+     :raise ValueError: If a and b are not the same length.
+     """
+     if len(a) != len(b):
+         raise ValueError("The lengths of the two bytes objects must be the same")
+
+     distance = 0
+     for b1, b2 in zip(a, b):
+         xor_result = b1 ^ b2
+         distance += bin(xor_result).count("1")
+
+     return distance
+
+
+ def iscc_distance(iscc1, iscc2):
+     # type: (str, str) -> int
+     """
+     Calculate the Hamming distance between two ISCC Semantic Text Codes.
+
+     :param iscc1: The first ISCC Semantic Text Code.
+     :param iscc2: The second ISCC Semantic Text Code.
+     :return: The Hamming distance between the two ISCC codes.
+     :raise ValueError: If the input ISCCs are not valid or of different lengths.
+     """
+     # Remove the "ISCC:" prefix if present
+     iscc1 = iscc1[5:] if iscc1.startswith("ISCC:") else iscc1
+     iscc2 = iscc2[5:] if iscc2.startswith("ISCC:") else iscc2
+
+     # Decode the base32-encoded ISCCs
+     decoded1 = decode_base32(iscc1)
+     decoded2 = decode_base32(iscc2)
+
+     # Check if the decoded ISCCs have the same length
+     if len(decoded1) != len(decoded2):
+         raise ValueError("The input ISCCs must have the same length")
+
+     # Remove the 2-byte header from each decoded ISCC
+     content1 = decoded1[2:]
+     content2 = decoded2[2:]
+
+     # Calculate and return the Hamming distance
+     return hamming_distance(content1, content2)
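A quick sketch of the distance helpers above (the header bytes are arbitrary placeholder values):

```python
from iscc_sct.utils import encode_base32, hamming_distance, iscc_distance

# 0b11110000 vs 0b00001111 differ in all 8 bits.
assert hamming_distance(b"\xf0", b"\x0f") == 8

# iscc_distance strips the "ISCC:" prefix, base32-decodes both codes,
# drops the 2-byte header, and compares the remaining digest bits.
code_a = "ISCC:" + encode_base32(b"\x00\x01" + bytes(8))            # all-zero digest
code_b = "ISCC:" + encode_base32(b"\x00\x01" + b"\x80" + bytes(7))  # one bit flipped
assert iscc_distance(code_a, code_b) == 1
```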
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,94 @@
+ [tool.poetry]
+ name = "iscc-sct"
+ version = "0.1.2"
+ description = "ISCC - Semantic Code Text"
+ authors = ["Titusz <[email protected]>"]
+ license = "CC-BY-NC-SA-4.0"
+ readme = "README.md"
+ homepage = "https://iscc.codes"
+ repository = "https://github.com/iscc/iscc-sct"
+ documentation = "https://github.com/iscc/iscc-sct"
+ keywords = ["iscc", "text similarity", "cross lingual", "semantic similarity"]
+ classifiers = [
+     "Development Status :: 4 - Beta",
+     "Intended Audience :: Developers",
+     "Intended Audience :: Science/Research",
+     "License :: OSI Approved :: Apache Software License",
+     "Natural Language :: English",
+     "Operating System :: OS Independent",
+     "Programming Language :: Python :: 3.9",
+     "Programming Language :: Python :: 3.10",
+     "Programming Language :: Python :: 3.11",
+     "Programming Language :: Python :: 3.12",
+     "Topic :: Text Processing",
+     "Topic :: Text Processing :: General",
+     "Topic :: Text Processing :: Indexing",
+     "Topic :: Text Processing :: Linguistic",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Multimedia :: Graphics",
+     "Topic :: Scientific/Engineering :: Image Recognition",
+     "Topic :: Scientific/Engineering :: Information Analysis",
+     "Topic :: Software Development :: Libraries :: Python Modules",
+     "Topic :: Software Development :: Libraries",
+     "Topic :: Software Development :: Libraries :: Python Modules",
+     "Topic :: System :: Archiving",
+     "Topic :: System :: Clustering",
+     "Topic :: System :: Distributed Computing",
+ ]
+
+ [tool.poetry.urls]
+ "Changelog" = "https://github.com/iscc/iscc-sct/blob/main/CHANGELOG.md"
+ "Bug Tracker" = "https://github.com/iscc/iscc-sct/issues"
+ "Twitter" = "https://twitter.com/iscc_foundation"
+ "Donate" = "https://iscc.foundation/support"
+
+ [tool.poetry.scripts]
+ sct = 'iscc_sct.cli:main'
+
+ [tool.poetry.dependencies]
+ python = ">=3.9,<3.13"
+ semantic-text-splitter = "*"
+ onnxruntime = "*"
+ onnxruntime-gpu = { version = "*", optional = true }
+ loguru = "*"
+ blake3 = "*"
+ platformdirs = "*"
+ tokenizers = "*"
+ pydantic-settings = "*"
+ charset-normalizer = "*"
+ numpy = "<2.0.0"
+ pybase64 = "^1.4.0"
+ certifi = ">=2024.07.04"
+ gradio = { version = "*", optional = true }
+
+
+ [tool.poetry.extras]
+ gpu = ["onnxruntime-gpu"]
+ demo = ["gradio"]
+
+ [tool.poetry.group.test.dependencies]
+ pytest = "*"
+ coverage = "*"
+ pytest-cov = "*"
+
+ [tool.poetry.group.dev.dependencies]
+ poethepoet = "*"
+ ruff = "*"
+ mdformat-gfm = "*"
+ mdformat-gfm-alerts = "*"
+
+ [tool.ruff]
+ line-length = 119
+
+ [tool.ruff.format]
+ line-ending = "lf"
+
+ [tool.poe.tasks]
+ format-code = { cmd = "ruff format", help = "Code style formatting with ruff" }
+ format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formatting with mdformat" }
+ test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100 --cov-report=term-missing --color=yes", help = "Run tests with coverage" }
+ all = ["format-code", "format-markdown", "test"]
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
space.yml ADDED
@@ -0,0 +1,44 @@
+ title: ISCC-LAB - Semantic-Code Text
+ emoji: ▶️
+ colorFrom: red
+ colorTo: blue
+ sdk: gradio
+ sdk_version: 4.41.0
+ app_file: iscc_sct/demo.py
+ pinned: true
+ license: CC-BY-NC-SA-4.0
+ python_version: 3.12
+ short_description: Cross Lingual Similarity Preserving Text Simprints
+ description: >
+   # ISCC-LAB - Semantic-Code Text
+
+   `iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
+   [ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
+   short identifiers created from text documents that preserve similarity (in hamming distance)
+   for semantically similar cross-lingual text inputs.
+
+   ## What is the ISCC
+
+   The ISCC is a combination of various similarity preserving fingerprints and an identifier for
+   digital media content.
+
+   ISCCs are generated algorithmically from digital content, just like cryptographic hashes. However,
+   instead of using a single cryptographic hash function to identify data only, the ISCC uses various
+   algorithms to create a composite identifier that exhibits similarity-preserving properties (soft
+   hash or Simprint).
+
+   The component-based structure of the ISCC identifies content at multiple levels of abstraction. Each
+   component is self-describing, modular, and can be used separately or with others to aid in various
+   content identification tasks. The algorithmic design supports content deduplication, database
+   synchronization, indexing, integrity verification, timestamping, versioning, data provenance,
+   similarity clustering, anomaly detection, usage tracking, allocation of royalties, fact-checking and
+   general digital asset management use-cases.
+
+
+   ## ISCC Status
+
+   The [ISCC](https://iscc.codes) is an ISO Standard published under
+   [ISO 24138:2024](https://www.iso.org/standard/77899.html) - International Standard Content Code
+   within [ISO/TC 46/SC 9/WG 18](https://www.iso.org/committee/48836.html).
+
+   The algorithms of this `iscc-sct` repository are experimental and not (yet) part of the official standard.
tests/__init__.py ADDED
File without changes
tests/benchmark.py ADDED
@@ -0,0 +1,55 @@
+ # -*- coding: utf-8 -*-
+ from loguru import logger as log
+ from pathlib import Path
+ import iscc_sct as sct
+ import argparse
+ import time
+
+
+ def benchmark(folder):
+     """
+     Benchmark Text-Code generation for all text files in `folder`.
+
+     Per-file stats are logged to the console during processing.
+     Comprehensive aggregated statistics are shown after processing all files.
+
+     :param folder: Folder containing text files for benchmarking
+     """
+     folder = Path(folder)
+     assert folder.is_dir(), f"{folder} is not a directory."
+
+     total_time = 0
+     file_count = 0
+
+     for txt_path in folder.glob("*.txt"):
+         start_time = time.time()
+         try:
+             iscc_meta = sct.code_text_semantic(txt_path)
+         except Exception as e:
+             log.error(f"Processing {txt_path.name} failed: {e}")
+             continue
+         end_time = time.time()
+         elapsed_time = end_time - start_time
+         total_time += elapsed_time
+         file_count += 1
+         log.info(f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}")
+
+     if file_count > 0:
+         avg_time = total_time / file_count
+         log.info(
+             f"Processed {file_count} files in {total_time:.2f} seconds. Average time per file: {avg_time:.2f} seconds."
+         )
+     else:
+         log.warning("No text files found in the provided folder.")
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
+     parser.add_argument("folder", type=str, help="Directory containing text files for benchmarking.")
+     args = parser.parse_args()
+
+     benchmark(args.folder)
+
+
+ if __name__ == "__main__":
+     main()
tests/conftest.py ADDED
@@ -0,0 +1,15 @@
+ import pytest
+ from pathlib import Path
+
+
+ HERE = Path(__file__).parent.absolute()
+
+
+ @pytest.fixture
+ def text_en():
+     return (HERE / "en.txt").read_text(encoding="utf-8")
+
+
+ @pytest.fixture
+ def text_de():
+     return (HERE / "de.txt").read_text(encoding="utf-8")
tests/de.txt ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Vielen Dank, Chris.
3
+
4
+ Es ist mir wirklich eine Ehre, zweimal auf dieser Bühne stehen zu dürfen.
5
+ Tausend Dank dafür.
6
+
7
+ Ich bin wirklich begeistert von dieser Konferenz, und ich danke Ihnen allen für die vielen netten Kommentare zu meiner Rede vorgestern Abend.
8
+ Das meine ich ernst, teilweise deshalb -- weil ich es wirklich brauchen kann!
9
+ (Lachen) Versetzen Sie sich mal in meine Lage!
10
+ (Lachen) (Applaus) Ich bin bin acht Jahre lang mit der Air Force Two geflogen.
11
+ Jetzt muss ich meine Schuhe ausziehen, um überhaupt an Bord zu kommen!
12
+ (Applaus) Ich erzähle Ihnen mal eine Geschichte, dann verstehen Sie mich vielleicht besser.
13
+ Eine wahre Geschichte -- kein Wort daran ist erfunden.
14
+ Kurz nachdem Tipper und ich aus dem (vorgetäuschtes Schluchzen) Weißen Haus ausgezogen waren, fuhren wir von unserem Haus in Nashville zu unserer kleinen Farm 50 Meilen östlich von Nashville --
15
+ und wir fuhren selbst.
16
+
17
+ (Lachen) Ich weiß, für Sie ist das nichts Ungewöhnliches, aber ...
18
+ (Lachen) Ich sah in den Rückspiegel und plötzlich traf mich eine Erkenntnis.
19
+
20
+ Hinter mir war gar keine Autokolonne.
21
+ Haben Sie schon mal vom Phantomschmerz gehört?
22
+ (Lachen)
23
+
24
+ Wir saßen in einem gemieteten Ford Taurus.
25
+ Es war Zeit zum Abendessen und wir hielten Ausschau nach einem Restaurant.
26
+
27
+ Wir waren auf der I-40.
28
+ Wir kamen zur Ausfahrt 238, Lebanon, Tennessee.
29
+ Wir fuhren ab und suchten nach einem ... wir fanden schließlich ein Shoney's.
30
+ Für alle, die es nicht kennen: Das ist eine billige Familienrestaurantkette.
31
+
32
+ Wir gingen rein und setzten uns in eine Nische.
33
+ Die Kellnerin kam zu uns und machte viel Aufhebens um Tipper.
34
+
35
+ Sie nahm unsere Bestellung auf, ging dann zum Paar in der Nische neben uns und senkte ihre Stimme so sehr, dass ich mich richtig anstrengen musste, um sie zu verstehen.
36
+ Sie sagte: "Ja, das ist Ex-Vizepräsident Al Gore und seine Frau Tipper."
37
+ Und der Mann antwortete: "Ganz schöner Abstieg, was?"
38
+ (Lachen) Es gab eine ganze Reihe solcher Offenbarungen.
39
+ Am nächsten Tag -- immer noch eine wahre Geschichte! -- flog ich in einer G5 nach Afrika, um in Nigeria eine Rede zu halten, in Lagos, und zwar über das Thema Energie.
40
+ Zu Beginn der Rede erzählte ich, was mir am Vortag in Nashville passiert war.
41
+
42
+ Ich erzählte es genau so, wie ich es Ihnen gerade erzählt habe.
43
+ Tipper und ich fuhren selbst, Shoney's, billige Familienrestaurantkette, was der Mann gesagt hatte -- alle lachten.
44
+
45
+ Ich hielt meine Rede, dann fuhr ich zurück zum Flughafen, um nach Hause zu fliegen.
46
+ Im Flugzeug schlief ich, bis wir mitten in der Nacht auf den Azoren landeten, um zu tanken.
47
+
48
+ Ich wachte auf, öffnete die Tür und ging hinaus, um frische Luft zu schnappen.
49
+ Da sah ich plötzlich einen Mann über das Rollfeld rennen.
50
+
51
+
52
+ Er wedelte mit einem Stück Papier und schrie: "Rufen Sie Washington an!
53
+ Rufen Sie Washington an!"
54
+
55
+
56
+ Ich dachte so: Mitten in der Nacht, mitten im Atlantik, was in der Welt könnte in Washington schief laufen?
57
+ Dann fiel mir ein, dass da so einiges in Frage kam.
58
+
59
+ (Lachen)
60
+
61
+ (Applaus) Aber mein Mitarbeiter war wegem Folgenden so aufgeregt: Eine der nigerianischen Nachrichtenagenturen hatte schon eine Story über meine Rede herausgegeben.
62
+ Und die war schon in Städten überall in den USA gedruckt worden --
63
+
64
+ auch in Monterey, das habe ich überprüft.
65
+
66
+ Und die Geschichte begann mit: "Ex-Vizepräsident Al Gore gab gestern in Nigeria bekannt: 'Meine Frau Tipper und ich haben ein billiges Familienrestaurant namens Shoney's eröffnet und wir führen es selbst.'"
67
+ Bevor ich wieder amerikanischen Boden betrat, machten David Letterman und Jay Leno schon Witze über mich -- einer von ihnen zeigte mich mit einer großen weißen Kochmütze und Tipper sagte: "Noch einen Burger mit Pommes!"
68
+
69
+ Drei Tage später bekam ich einen netten, langen, handgeschriebenen Brief von meinem Freund, Partner und Kollegen Bill Clinton, in dem er schrieb: "Glückwunsch zum neuen Restaurant, Al!"
70
+ (Lachen) Wir freuen uns immer, wenn der andere Erfolg im Leben hat.
71
+ Ich wollte eigentlich über Informationsökologie sprechen.
72
+ Aber ich dachte, da ich ohnehin noch sehr oft zu TED zurückkommen will, könnte ich das vielleicht auf ein anderes Mal verschieben.
73
+ Chris Anderson: Abgemacht!
74
+
75
+ Ich möchte mich auf das konzentrieren, was viele von Ihnen von mir hören wollen.
76
+ Was kann jeder Einzelne gegen die Klimakrise tun?
77
+
78
+
79
+ Ich möchte beginnen mit ...
80
+ Ich werde einige neue Bilder zeigen und nur vier oder fünf noch mal durchgehen.
81
+
82
+ Ein Wort zur Diashow.
83
+ Ich aktualisiere sie jedes Mal, bevor ich sie zeige.
84
+ Ich füge neue Bilder hinzu, weil ich jedes Mal wieder etwas dazulerne.
85
+ Wie beim Strandgutsammeln -- jedes Mal, wenn die Flut da war,
86
+ findet man neue Muschelschalen.
87
+ Erst in den letzten beiden Tagen hatten wir neue Januar-Temperaturrekorde.
88
+ Das gilt jetzt nur für die USA.
89
+
90
+ Der historische Durchschnitt für Januar liegt bei minus 0,6 Grad.
91
+ Im letzten Monat waren es plus 4,2 Grad.
92
+
93
+ Ich weiß ja, dass Sie auf weitere schlechte Umweltnachrichten warten -- Ich mache nur Spaß --
94
+ aber jetzt kommt erst mal eine kurze Wiederholung und dann zeige ich Ihnen neues Material über mögliche Lösungen.
95
+ Aber erst wollte ich zu einigen Dias noch etwas sagen.
96
+ Zunächst steuern wir hier mit dem US-Beitrag zur Erderwärmung hin, wenn nichts unternommen wird.
97
+ Endverbraucher-Effizienz bei Strom und anderen Energien, das sind die niedrig hängenden Trauben.
98
+ Effizienz und Umweltschutz: Das ist kein Kostenfaktor, sondern ein Gewinnfaktor.
99
+ Das Vorzeichen ist falsch.
100
+ Es ist nicht negativ, sondern positiv.
101
+ Diese Investitionen amortisieren sich von selbst.
102
+ Aber sie lenken uns auch sehr effektiv vom richtigen Weg ab.
103
+ Autos und LKW -- darüber habe ich in der Diashow schon gesprochen, aber ich möchte, dass Sie es im rechten Licht betrachten.
104
+ Das ist ein einfacher, sichtbarer Kritikpunkt, und so sollte es auch sein, aber Gebäude haben einen größeren Anteil an der Erderwärmung als Autos und LKW.
105
+ Autos und LKW sind sehr wichtig, und wir haben die weltweit niedrigsten Normen,
106
+ daher sollten wir das Thema anpacken.
107
+
108
+ Aber es ist nur ein Teil des Ganzen.
109
+ Die Effizienz anderer Transportmittel ist ebenso wichtig wie bei Autos und LKW!
110
+
111
+ Erneuerbare Energien können bei der derzeitigen Technologieeffizienz einiges ausmachen, und nach den Aussagen von Vinod, John Doerr und anderen,
112
+ vielen von Ihnen -- hier sind viele Menschen direkt beteiligt -- wird dieser Keil viel schneller wachsen, als die aktuelle Projektion zeigt.
113
+ Die CO2-Sequestrierung -- abgekürzt CCS -- wird sich wahrscheinlich zum ultimativen Werkzeug entwickeln, mit dem wir fossile Brennstoffe auf sichere Weise weiterhin nutzen können.
114
+ Da sind wir noch nicht ganz.
115
+ Was kann nun der Einzelne tun?
116
+ Emissionen im eigenen Haus reduzieren.
117
+ Die meisten dieser Ausgaben sparen langfristig auch Geld.
118
+ Isolierung, besseres Baudesign, kaufen Sie möglichst umweltfreundlichen Strom.
119
+ Ich sprach von Autos -- kaufen Sie eins mit Hybridantrieb.
120
+ Nutzen Sie den öffentlichen Verkehr.
121
+ Sehen Sie sich nach anderen, besseren Lösungen um.
122
+ Das ist wichtig.
123
+ Kaufen Sie "grün".
124
+ Bei allem, was Sie einkaufen, haben Sie die Wahl zwischen Produkten mit ungünstigen und deutlich weniger ungünstigen Auswirkungen auf die globale Klimakrise.
125
+ Entscheiden Sie sich für ein CO2-neutrales Leben.
126
+ Denjenigen von Ihnen, die sich mit Slogans auskennen, wäre ich sehr dankbar für Tipps und Hilfe, wie man das so formulieren kann, dass es bei der Masse ankommt.
127
+ Es ist einfacher, als Sie glauben.
128
+
129
+ Wirklich.
130
+ Viele von uns hier haben diese Entscheidung getroffen, und es ist wirklich nicht schwer.
131
+
132
+ Reduzieren Sie Ihre CO2-Emissionen durch jede Wahl, die Sie treffen können, und kaufen oder erwerben Sie einen Ausgleich für den Rest, den Sie nicht vermeiden können.
133
+ Genauer wird das auf climatecrisis.net erklärt.
134
+ Da gibt es einen CO2-Rechner.
135
+ Participant Productions hat unter meiner aktiven Teilnahme die führenden Programmierer der Welt zusammengerufen, um aus dieser geheimnisvollen Kunst der CO2-Berechnung einen anwenderfreundlichen CO2-Rechner zu basteln.
136
+ Sie können sehr genau Ihre persönlichen CO2-Emissionen berechnen und erfahren dann Möglichkeiten, sie zu reduzieren.
137
+ Bis zum Filmstart im Mai wird es ein Update auf Version 2.0 geben, in der man sich dann direkt zum Kauf von Ausgleichseinheiten durchklicken kann.
138
+ Versuchen Sie, Ihr Unternehmen CO2-neutral zu führen.
139
+ Auch das haben einige hier schon getan, und es ist leichter, als man denkt.
140
+ Beziehen Sie Klimalösungen in Ihre Innovationen mit ein, egal, ob Sie im Bereich Technologie, Unterhaltung oder Bauwesen und Architektur arbeiten.
141
+ Investieren Sie nachhaltig.
142
+ Davon hat Majora schon gesprochen.
143
+ Wenn Sie Geld in Manager investieren, die Sie auf der Grundlage ihrer Jahresleistung entlohnen, dann beklagen Sie sich nie wieder über kurzfristiges Management.
144
+ Langfristig tun die Leute, wofür man sie bezahlt.
145
+ Und wenn sie aufgrund von kurzfristigen Gewinnen beurteilen, wie viel sie aus Ihrem investierten Kapital herausholen können, dann treffen sie kurzfristige Entscheidungen.
146
+ Darüber lässt sich noch so einiges sagen.
147
+ Werden Sie ein Katalysator für den Wandel.
148
+ Lehren Sie andere, lernen Sie, reden Sie darüber.
149
+ Der Film ist eine Filmversion der Diashow, die ich vorgestern gezeigt habe, nur viel unterhaltsamer.
150
+ Und er kommt im Mai heraus.
151
+ Viele von Ihnen hier können dafür sorgen, dass eine Menge Leute ihn sehen.
152
+ Schicken Sie jemanden nach Nashville.
153
+ Suchen Sie ihn sorgfältig aus.
154
+ Ich werde persönlich Menschen schulen, diese Diashow zu zeigen, abgewandelt, die persönlichen Geschichten werden natürlich durch etwas Allgemeineres ersetzt. Es geht nicht nur um die Dias, sondern um ihre Bedeutung, ihren Zusammenhang.
155
+ Daher werde ich in diesem Sommer einen Kurs für eine Gruppe von Menschen abhalten, die von verschiedenen Leuten dafür nominiert werden, diesen Vortrag massenweise in Gemeinden im ganzen Land zu halten, und wir werden die Diashow für alle jede Woche aktualisieren, damit sie immer topaktuell ist.
156
+ In Zusammenarbeit mit Larry Lessig wird sie demnächst auch mit Tools und eingeschränkten Nutzungsrechten veröffentlicht werden, damit junge Leute eigene Remixe herstellen und sie auf ihre Art präsentieren können.
157
+ (Applaus) Woher stammt nur die Empfehlung, von Politik sollte man sich fernhalten?
158
+ Das bedeutet nicht, dass ich versuchen will, aus Republikanern Demokraten zu machen.
159
+
160
+ Wir brauchen auch Republikaner.
161
+ Das war früher ein parteiübergreifendes Thema,
162
+
163
+ und in dieser Gruppe ist es das ebenfalls.
164
+ Werden Sie politisch aktiv!
165
+ Sorgen Sie dafür, dass unsere Demokratie so funktioniert, wie sie sollte.
166
+ Unterstützen Sie die Beschränkung von CO2-Emissionen, also der Verschmutzung, die die Erderwärmung antreibt, und den Handel damit.
167
+ Und zwar aus folgendem Grund: Solange die USA außen vor bleiben, ist das Weltsystem kein geschlossenes System.
168
+
169
+ Sobald es ein geschlossenes System wird, mit Beteiligung der USA, dann wird jedes Vorstandsmitglied ...
170
+ Wie viele von Ihnen sitzen im Vorstand eines Unternehmens?
171
+
172
+ In einem geschlossenen System sind Sie gesetzlich haftbar, wenn Sie den Vorstandsvorsitzenden nicht drängen, maximales Einkommen aus der Reduzierung und dem Handel mit unvermeidlichen CO2-Emissionen zu erzielen.
173
+ Der Markt wird dieses Problem lösen -- wenn wir das schaffen.
174
+ Helfen Sie bei der groß angelegten Meinungskampagne, die im Frühling beginnt.
175
+ Wir müssen die Amerikaner zum Umdenken bringen.
176
+ Denn gegenwärtig dürfen die Politiker nicht das tun, was getan werden muss.
177
+ In unserem modernen Land ist es nicht mehr Aufgabe von Logik und Vernunft, zwischen Wohlstand und Macht zu vermitteln, wie es früher einmal war.
178
+ Heute brauchen wir viele kurze, brandaktuelle 28 - 30 Sekunden lange Fernsehspots.
179
+ Wir müssen eine Menge solcher Spots kaufen.
180
+ Wir sollten die Erderwärmung umbenennen, wie viele von Ihnen vorgeschlagen haben.
181
+
182
+ Mir gefällt "Klimakrise" besser als "Klimakatastrophe".
183
+ Aber noch einmal, ich brauche Hilfe von Leuten, die sich mit Markenentwicklung auskennen.
184
+
185
+ Ein Wissenschaftler sagte mal zu mir, wir stünden jetzt vor der Prüfung, ob die Kombination aus einem opponierbaren Daumen und einem Neocortex überhaupt lebensfähig ist.
186
+ Das stimmt tatsächlich.
187
+ Wie ich vorgestern bereits sagte: Das ist kein politisches Thema.
188
+ Noch einmal an alle Republikaner hier: Es sollte dabei nicht um Parteipolitik gehen.
189
+ Sie haben mehr Einfluss als einige von uns Demokraten.
190
+ Dies ist eine Gelegenheit.
191
+ Nicht nur das, sondern in Verbindung mit den anderen Ideen hier können wir einen größeren Zusammenhang herstellen.
192
+ Wir sind eins.
193
+ Ich danke Ihnen vielmals.
194
+ (Applaus)
tests/en.txt ADDED
@@ -0,0 +1,155 @@
1
+
2
+ Thank you so much, Chris.
3
+ And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.
4
+ I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.
5
+ And I say that sincerely, partly because (Mock sob) I need that.
6
+ (Laughter) Put yourselves in my position.
7
+ (Laughter) I flew on Air Force Two for eight years.
8
+ (Laughter) Now I have to take off my shoes or boots to get on an airplane!
9
+ (Laughter) (Applause) I'll tell you one quick story to illustrate what that's been like for me.
10
+ (Laughter) It's a true story -- every bit of this is true.
11
+ Soon after Tipper and I left the -- (Mock sob) White House -- (Laughter) we were driving from our home in Nashville to a little farm we have 50 miles east of Nashville.
12
+ Driving ourselves.
13
+ (Laughter) I know it sounds like a little thing to you, but -- (Laughter) I looked in the rear-view mirror and all of a sudden it just hit me.
14
+ There was no motorcade back there.
15
+ (Laughter) You've heard of phantom limb pain?
16
+ (Laughter) This was a rented Ford Taurus.
17
+ (Laughter) It was dinnertime, and we started looking for a place to eat.
18
+ We were on I-40.
19
+ We got to Exit 238, Lebanon, Tennessee.
20
+ We got off the exit, we found a Shoney's restaurant.
21
+ Low-cost family restaurant chain, for those of you who don't know it.
22
+ We went in and sat down at the booth, and the waitress came over, made a big commotion over Tipper.
23
+ (Laughter) She took our order, and then went to the couple in the booth next to us, and she lowered her voice so much, I had to really strain to hear what she was saying.
24
+ And she said "Yes, that's former Vice President Al Gore and his wife, Tipper."
25
+ And the man said, "He's come down a long way, hasn't he?"
26
+ (Laughter) (Applause) There's been kind of a series of epiphanies.
27
+ (Laughter) The very next day, continuing the totally true story, I got on a G-V to fly to Africa to make a speech in Nigeria, in the city of Lagos, on the topic of energy.
28
+ And I began the speech by telling them the story of what had just happened the day before in Nashville.
29
+ And I told it pretty much the same way I've just shared it with you: Tipper and I were driving ourselves, Shoney's, low-cost family restaurant chain, what the man said -- they laughed.
30
+ I gave my speech, then went back out to the airport to fly back home.
31
+ I fell asleep on the plane until, during the middle of the night, we landed on the Azores Islands for refueling.
32
+ I woke up, they opened the door, I went out to get some fresh air, and I looked, and there was a man running across the runway.
33
+
34
+ And he was waving a piece of paper, and he was yelling, "Call Washington!
35
+ Call Washington!"
36
+
37
+ And I thought to myself, in the middle of the night, in the middle of the Atlantic, what in the world could be wrong in Washington?
38
+ Then I remembered it could be a bunch of things.
39
+ (Laughter) But what it turned out to be, was that my staff was extremely upset because one of the wire services in Nigeria had already written a story about my speech, and it had already been printed in cities all across the United States of America.
40
+ It was printed in Monterey, I checked.
41
+
42
+ (Laughter) And the story began, "Former Vice President Al Gore announced in Nigeria yesterday," quote: 'My wife Tipper and I have opened a low-cost family restaurant'" -- (Laughter) "'named Shoney's, and we are running it ourselves.'"
43
+ (Laughter) Before I could get back to U.S. soil, David Letterman and Jay Leno had already started in on -- one of them had me in a big white chef's hat, Tipper was saying, "One more burger with fries!"
44
+
45
+ (Laughter) Three days later, I got a nice, long, handwritten letter from my friend and partner and colleague Bill Clinton, saying, "Congratulations on the new restaurant, Al!"
46
+ (Laughter) We like to celebrate each other's successes in life.
47
+ (Laughter) I was going to talk about information ecology.
48
+ But I was thinking that, since I plan to make a lifelong habit of coming back to TED, that maybe I could talk about that another time.
49
+ (Applause) Chris Anderson: It's a deal!
50
+ (Applause) Al Gore: I want to focus on what many of you have said you would like me to elaborate on: What can you do about the climate crisis?
51
+ I want to start with a couple of -- I'm going to show some new images, and I'm going to recapitulate just four or five.
52
+ Now, the slide show.
53
+ I update the slide show every time I give it.
54
+ I add new images, because I learn more about it every time I give it.
55
+ It's like beach-combing, you know?
56
+ Every time the tide comes in and out, you find some more shells.
57
+ Just in the last two days, we got the new temperature records in January.
58
+ This is just for the United States of America.
59
+ Historical average for Januarys is 31 degrees; last month was 39.5 degrees.
60
+ Now, I know that you wanted some more bad news about the environment -- I'm kidding.
61
+ But these are the recapitulation slides, and then I'm going to go into new material about what you can do.
62
+ But I wanted to elaborate on a couple of these.
63
+ First of all, this is where we're projected to go with the U.S. contribution to global warming, under business as usual.
64
+ Efficiency in end-use electricity and end-use of all energy is the low-hanging fruit.
65
+ Efficiency and conservation -- it's not a cost; it's a profit.
66
+ The sign is wrong.
67
+ It's not negative; it's positive.
68
+ These are investments that pay for themselves.
69
+ But they are also very effective in deflecting our path.
70
+ Cars and trucks -- I talked about that in the slideshow, but I want you to put it in perspective.
71
+ It's an easy, visible target of concern -- and it should be -- but there is more global warming pollution that comes from buildings than from cars and trucks.
72
+ Cars and trucks are very significant, and we have the lowest standards in the world.
73
+
74
+ And so we should address that.
75
+ But it's part of the puzzle.
76
+
77
+ Other transportation efficiency is as important as cars and trucks.
78
+ Renewables at the current levels of technological efficiency can make this much difference.
79
+ And with what Vinod, and John Doerr and others, many of you here -- there are a lot of people directly involved in this -- this wedge is going to grow much more rapidly than the current projection shows it.
80
+ Carbon Capture and Sequestration -- that's what CCS stands for -- is likely to become the killer app that will enable us to continue to use fossil fuels in a way that is safe.
81
+ Not quite there yet.
82
+
83
+ OK.
84
+ Now, what can you do?
85
+
86
+ Reduce emissions in your home.
87
+ Most of these expenditures are also profitable.
88
+ Insulation, better design.
89
+ Buy green electricity where you can.
90
+ I mentioned automobiles -- buy a hybrid.
91
+ Use light rail.
92
+ Figure out some of the other options that are much better.
93
+ It's important.
94
+ Be a green consumer.
95
+ You have choices with everything you buy, between things that have a harsh effect, or a much less harsh effect on the global climate crisis.
96
+ Consider this: Make a decision to live a carbon-neutral life.
97
+ Those of you who are good at branding, I'd love to get your advice and help on how to say this in a way that connects with the most people.
98
+ It is easier than you think.
99
+ It really is.
100
+ A lot of us in here have made that decision, and it is really pretty easy.
101
+ It means reduce your carbon dioxide emissions with the full range of choices that you make, and then purchase or acquire offsets for the remainder that you have not completely reduced.
102
+ And what it means is elaborated at climatecrisis.net.
103
+ There is a carbon calculator.
104
+ Participant Productions convened -- with my active involvement -- the leading software writers in the world, on this arcane science of carbon calculation, to construct a consumer-friendly carbon calculator.
105
+ You can very precisely calculate what your CO2 emissions are, and then you will be given options to reduce.
106
+ And by the time the movie comes out in May, this will be updated to 2.0, and we will have click-through purchases of offsets.
107
+ Next, consider making your business carbon-neutral.
108
+ Again, some of us have done that, and it's not as hard as you think.
109
+ Integrate climate solutions into all of your innovations, whether you are from the technology, or entertainment, or design and architecture community.
110
+ Invest sustainably.
111
+ Majora mentioned this.
112
+ Listen, if you have invested money with managers who you compensate on the basis of their annual performance, don't ever again complain about quarterly report CEO management.
113
+ Over time, people do what you pay them to do.
114
+ And if they judge how much they're going to get paid on your capital that they've invested, based on the short-term returns, you're going to get short-term decisions.
115
+ A lot more to be said about that.
116
+ Become a catalyst of change.
117
+ Teach others, learn about it, talk about it.
118
+ The movie is a movie version of the slideshow I gave two nights ago, except it's a lot more entertaining.
119
+ And it comes out in May.
120
+ Many of you here have the opportunity to ensure that a lot of people see it.
121
+ Consider sending somebody to Nashville.
122
+ Pick well.
123
+ And I am personally going to train people to give this slideshow -- re-purposed, with some of the personal stories obviously replaced with a generic approach, and it's not just the slides, it's what they mean.
124
+ And it's how they link together.
125
+ And so I'm going to be conducting a course this summer for a group of people that are nominated by different folks to come and then give it en masse, in communities all across the country, and we're going to update the slideshow for all of them every single week, to keep it right on the cutting edge.
126
+ Working with Larry Lessig, it will be, somewhere in that process, posted with tools and limited-use copyrights, so that young people can remix it and do it in their own way.
127
+ (Applause) Where did anybody get the idea that you ought to stay arm's length from politics?
128
+ It doesn't mean that if you're a Republican, that I'm trying to convince you to be a Democrat.
129
+ We need Republicans as well.
130
+ This used to be a bipartisan issue, and I know that in this group it really is.
131
+ Become politically active.
132
+ Make our democracy work the way it's supposed to work.
133
+ Support the idea of capping carbon dioxide emissions -- global warming pollution -- and trading it.
134
+ Here's why: as long as the United States is out of the world system, it's not a closed system.
135
+ Once it becomes a closed system, with U.S. participation, then everybody who's on a board of directors -- how many people here serve on the board of directors of a corporation?
136
+ Once it's a closed system, you will have legal liability if you do not urge your CEO to get the maximum income from reducing and trading the carbon emissions that can be avoided.
137
+ The market will work to solve this problem -- if we can accomplish this.
138
+ Help with the mass persuasion campaign that will start this spring.
139
+ We have to change the minds of the American people.
140
+ Because presently, the politicians do not have permission to do what needs to be done.
141
+ And in our modern country, the role of logic and reason no longer includes mediating between wealth and power the way it once did.
142
+ It's now repetition of short, hot-button, 30-second, 28-second television ads.
143
+ We have to buy a lot of those ads.
144
+ Let's re-brand global warming, as many of you have suggested.
145
+ I like "climate crisis" instead of "climate collapse," but again, those of you who are good at branding, I need your help on this.
146
+ Somebody said the test we're facing now, a scientist told me, is whether the combination of an opposable thumb and a neocortex is a viable combination.
147
+ (Laughter) That's really true.
148
+ I said the other night, and I'll repeat now: this is not a political issue.
149
+ Again, the Republicans here -- this shouldn't be partisan.
150
+ You have more influence than some of us who are Democrats do.
151
+ This is an opportunity.
152
+ Not just this, but connected to the ideas that are here, to bring more coherence to them.
153
+ We are one.
154
+ Thank you very much, I appreciate it.
155
+ (Applause)
tests/freeze_tokenizer.py ADDED
@@ -0,0 +1,17 @@
1
+ """Helper script do dump/freeze the current tokenizer"""
2
+
3
+ from tokenizers import Tokenizer
4
+ from pathlib import Path
5
+
6
+
7
+ HERE = Path(__file__).parent.absolute()
8
+
9
+
10
+ def main():
11
+ MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
12
+ tokenizer = Tokenizer.from_pretrained(MODEL_NAME)
13
+ tokenizer.save((HERE.parent / "iscc_sct/tokenizer.json").as_posix(), pretty=False)
14
+
15
+
16
+ if __name__ == "__main__":
17
+ main()
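
The frozen `tokenizer.json` pins the exact tokenization so that chunk boundaries and the hashes asserted in the test suite stay reproducible across upstream model updates (see `test_tokenizer_integrity` further below). A minimal sketch of loading the frozen file instead of fetching from the hub; the path mirrors the one used by the script above and assumes the repository root as working directory, and the printed ids are not asserted here:

    from pathlib import Path
    from tokenizers import Tokenizer

    # Load the frozen tokenizer shipped with the package (path as used above).
    frozen = Path("iscc_sct") / "tokenizer.json"
    tokenizer = Tokenizer.from_file(frozen.as_posix())
    print(tokenizer.encode("Hello World").ids)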
tests/test_cli.py ADDED
@@ -0,0 +1,63 @@
1
+ import subprocess
2
+ import pytest
3
+ import shutil
4
+
5
+ sct = shutil.which("sct")
6
+
7
+
8
+ @pytest.fixture
9
+ def sample_text_file(tmp_path):
10
+ file_path = tmp_path / "sample.txt"
11
+ file_path.write_text("This is a sample text for testing.")
12
+ return file_path
13
+
14
+
15
+ @pytest.fixture
16
+ def empty_text_file(tmp_path):
17
+ file_path = tmp_path / "empty.txt"
18
+ file_path.write_text(" ")
19
+ return file_path
20
+
21
+
22
+ @pytest.fixture
23
+ def non_utf8_text_file(tmp_path):
24
+ file_path = tmp_path / "non_utf8.txt"
25
+ file_path.write_text("Iñtërnâtiônàlizætiøn☃", encoding="utf-16")
26
+ return file_path
27
+
28
+
29
+ def test_cli_no_args():
30
+ result = subprocess.run([sct], capture_output=True, text=True)
31
+ assert result.returncode == 0
32
+ assert "Generate Semantic" in result.stdout
33
+
34
+
35
+ def test_cli_empty_file(empty_text_file):
36
+ result = subprocess.run([sct, str(empty_text_file), "-d"], capture_output=True, text=True)
37
+ assert result.returncode == 0
38
+ assert "SKIPPED" in result.stderr
39
+
40
+
41
+ def test_cli_non_utf8_file(non_utf8_text_file):
42
+ result = subprocess.run([sct, str(non_utf8_text_file), "-d"], capture_output=True, text=True)
43
+ assert result.returncode == 0
44
+ assert "Could not decode" in result.stderr
45
+ assert "ISCC:" in result.stdout
46
+
47
+
48
+ def test_cli_generate_sct(sample_text_file):
49
+ result = subprocess.run([sct, str(sample_text_file)], capture_output=True, text=True)
50
+ assert result.returncode == 0
51
+ assert "ISCC:" in result.stdout
52
+
53
+
54
+ def test_cli_generate_sct_granular(sample_text_file):
55
+ result = subprocess.run([sct, str(sample_text_file), "--granular"], capture_output=True, text=True)
56
+ assert result.returncode == 0
57
+ assert "features" in result.stdout
58
+
59
+
60
+ def test_cli_debug_mode(sample_text_file):
61
+ result = subprocess.run([sct, str(sample_text_file), "--debug"], capture_output=True, text=True)
62
+ assert result.returncode == 0
63
+ assert "DEBUG" in result.stderr
tests/test_demo.py ADDED
@@ -0,0 +1,80 @@
1
+ from iscc_sct.demo import (
2
+ compute_iscc_code,
3
+ compare_codes,
4
+ hamming_to_cosine,
5
+ generate_similarity_bar,
6
+ )
7
+
8
+
9
+ def test_compute_iscc_code():
10
+ text1 = "Hello, world!"
11
+ text2 = "Hallo, Welt!"
12
+ bit_length = 64
13
+
14
+ result = compute_iscc_code(text1, text2, bit_length)
15
+ assert len(result) == 3
16
+ assert all(isinstance(code, str) for code in result[:2])
17
+ assert isinstance(result[2], str)
18
+
19
+
20
+ def test_compare_codes():
21
+ code_a = "ISCC:EAAQCVG2TABD6"
22
+ code_b = "ISCC:EAAQCVG2TABD6"
23
+ bits = 64
24
+
25
+ result = compare_codes(code_a, code_b, bits)
26
+ assert isinstance(result, str)
27
+ assert "100.00%" in result
28
+
29
+ result = compare_codes(None, code_b, bits)
30
+ assert result is None
31
+
32
+
33
+ def test_hamming_to_cosine():
34
+ assert hamming_to_cosine(0, 64) == 1.0
35
+ assert hamming_to_cosine(32, 64) == 0.0
36
+ assert hamming_to_cosine(64, 64) == -1.0
37
+
38
+
39
+ def test_generate_similarity_bar():
40
+ result = generate_similarity_bar(1.0)
41
+ assert "100.00%" in result
42
+ assert "green" in result
43
+
44
+ result = generate_similarity_bar(-0.5)
45
+ assert "-50.00%" in result
46
+ assert "red" in result
47
+
48
+
49
+ from unittest.mock import patch
50
+ import gradio as gr
51
+ from iscc_sct.demo import process_text
52
+
53
+
54
+ @patch("iscc_sct.demo.sct.gen_text_code_semantic")
55
+ def test_process_text(mock_gen_text_code):
56
+ mock_gen_text_code.return_value = {"iscc": "ISCC:EAAQCVG2TABD6"}
57
+
58
+ # Test with valid input
59
+ result = process_text("Hello, world!", 64, "a")
60
+ assert isinstance(result, dict)
61
+ assert len(result) == 1
62
+ key, value = next(iter(result.items()))
63
+ assert isinstance(key, gr.components.Textbox)
64
+ assert isinstance(value, gr.components.Textbox)
65
+ assert value.value == "ISCC:EAAQCVG2TABD6"
66
+
67
+ # Test with empty input
68
+ result = process_text("", 64, "b")
69
+ assert result is None
70
+
71
+ # Test with different bit length
72
+ process_text("Test", 128, "a")
73
+ mock_gen_text_code.assert_called_with("Test", bits=128)
74
+
75
+ # Test with different suffix
76
+ result = process_text("Test", 64, "b")
77
+ assert len(result) == 1
78
+ key, value = next(iter(result.items()))
79
+ assert isinstance(key, gr.components.Textbox)
80
+ assert isinstance(value, gr.components.Textbox)
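
The `hamming_to_cosine` expectations above (0 bits apart gives 1.0, half the bits gives 0.0, all bits gives -1.0) follow from treating each code bit as a ±1 vector component: every differing bit flips one component, so similarity falls off linearly with bit distance. A sketch consistent with those assertions (the demo module's actual implementation may differ):

    def hamming_to_cosine(distance: int, bits: int) -> float:
        # Each differing bit flips one +/-1 component, so cosine similarity
        # drops linearly from 1.0 (identical) to -1.0 (all bits differ).
        return 1.0 - 2.0 * distance / bits

    assert hamming_to_cosine(16, 64) == 0.5  # hypothetical intermediate value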
tests/test_iscc_sct.py ADDED
@@ -0,0 +1,245 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+ from blake3 import blake3
5
+
6
+ import iscc_sct as sct
7
+ from iscc_sct.code_semantic_text import (
8
+ split_text,
9
+ tokenize_chunks,
10
+ embed_tokens,
11
+ embed_chunks,
12
+ compress,
13
+ )
14
+ import numpy as np
15
+
16
+
17
+ HERE = Path(__file__).parent.absolute()
18
+
19
+ TEXT = """
20
+ `iscc-sct` is a **proof of concept implementation** of a semantic Text-Code for the
21
+ [ISCC](https://core.iscc.codes) (*International Standard Content Code*). Semantic Text-Codes are
22
+ designed to capture and represent the language agnostic semantic content of text for improved
23
+ similarity detection.
24
+
25
+ The ISCC framework already comes with a Text-Code that is based on lexical similarity and can match
26
+ near duplicates. The ISCC Semantic Text-Code is planned as a new additional ISCC-UNIT focused on
27
+ capturing a more abstract and broad semantic similarity. As such the Semantic Text-Code is
28
+ engineered to be robust against a broader range of variations and translations of text that cannot
29
+ be matched based on lexical similarity.
30
+ """
31
+
32
+
33
+ def test_version():
34
+ assert sct.__version__ == "0.1.2"
35
+
36
+
37
+ def test_code_text_semantic_default():
38
+ fp = HERE / "en.txt"
39
+ result = sct.code_text_semantic(fp)
40
+ assert result == {
41
+ "iscc": "ISCC:CAA636IXQD736IGJ",
42
+ "characters": 12076,
43
+ }
44
+
45
+
46
+ def test_code_text_semantic_no_chars():
47
+ fp = HERE / "en.txt"
48
+ result = sct.code_text_semantic(fp, characters=False)
49
+ assert result == {"iscc": "ISCC:CAA636IXQD736IGJ"}
50
+
51
+
52
+ def test_code_text_semantic_embedding():
53
+ fp = HERE / "en.txt"
54
+ result = sct.code_text_semantic(fp, embedding=True)
55
+ assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
56
+ assert len(result["features"][0]["embedding"]) == 384
57
+
58
+
59
+ def test_code_text_semantic_features():
60
+ fp = HERE / "en.txt"
61
+ result = sct.code_text_semantic(fp, simprints=True)
62
+ assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
63
+ assert result["characters"] == 12076
64
+ assert result["features"][0]["simprints"][:3] == ["5wkXkfEx4lE", "b2UVwfc3wgk", "qvlV0W63s90"]
65
+ assert result["features"][0]["simprints"][-3:] == ["PNsX9eGZQEs", "fFk3M2u5Qkk", "TPuXs2sRtk8"]
66
+
67
+
68
+ def test_code_text_semantic_offsets():
69
+ fp = HERE / "en.txt"
70
+ result = sct.code_text_semantic(fp, offsets=True)
71
+ assert result["features"][0]["offsets"][:3] == [0, 277, 612]
72
+
73
+
74
+ def test_code_text_semantic_chunks():
75
+ fp = HERE / "en.txt"
76
+ result = sct.code_text_semantic(fp, contents=True)
77
+ assert len(result["features"][0]["contents"]) == 39
78
+ assert result["features"][0]["contents"][0].startswith("\n Thank ")
79
+ assert result["features"][0]["contents"][-1].endswith("(Applause)\n")
80
+
81
+
82
+ def test_code_text_semantic_sizes():
83
+ fp = HERE / "en.txt"
84
+ result = sct.code_text_semantic(fp, sizes=True)
85
+ # fmt: off
86
+ assert result["features"][0]["sizes"] == [
87
+ 440, 396, 431, 385, 440, 380, 406, 477, 415, 536, 280, 449, 446, 442, 443, 444, 451, 485,
88
+ 477, 439, 517, 430, 468, 394, 531, 448, 421, 503, 376, 403, 513, 477, 393, 375, 555, 533,
89
+ 312, 455, 413
90
+ ]
91
+ # fmt: on
92
+
93
+
94
+ def test_gen_text_code_semantic_empty():
95
+ with pytest.raises(ValueError) as excinfo:
96
+ sct.gen_text_code_semantic("")
97
+ assert str(excinfo.value) == "Input text cannot be empty."
98
+
99
+
100
+ def test_gen_text_code_semantic_granular():
101
+ result = sct.gen_text_code_semantic(
102
+ TEXT,
103
+ simprints=True,
104
+ offsets=True,
105
+ contents=True,
106
+ )
107
+ assert (
108
+ result
109
+ == {
110
+ "characters": 726,
111
+ "iscc": "ISCC:CAARISHPJHEXQAYL",
112
+ "features": [
113
+ {
114
+ "maintype": "semantic",
115
+ "subtype": "text",
116
+ "version": 0,
117
+ "simprints": ["FWjtTcl4Aws", "lAjHSc1wAws"],
118
+ "offsets": [0, 297],
119
+ "contents": [
120
+ "\n"
121
+ "`iscc-sct` is a **proof of concept implementation** of a semantic "
122
+ "Text-Code for the\n"
123
+ "[ISCC](https://core.iscc.codes) (*International Standard Content "
124
+ "Code*). Semantic Text-Codes are\n"
125
+ "designed to capture and represent the language agnostic semantic "
126
+ "content of text for improved\n"
127
+ "similarity detection.\n"
128
+ "\n", # NOTE: end of first chunk (see comma :)
129
+ "\n"
130
+ "\n"
131
+ "The ISCC framework already comes with a Text-Code that is based "
132
+ "on lexical similarity and can match\n"
133
+ "near duplicates. The ISCC Semantic Text-Code is planned as a new "
134
+ "additional ISCC-UNIT focused on\n"
135
+ "capturing a more abstract and broad semantic similarity. As such "
136
+ "the Semantic Text-Code is\n"
137
+ "engineered to be robust against a broader range of variations and "
138
+ "translations of text that cannot\n"
139
+ "be matched based on lexical similarity.\n",
140
+ ],
141
+ }
142
+ ],
143
+ }
144
+ )
145
+
146
+
147
+ def test_gen_text_code_semantic_checks_bits():
148
+ with pytest.raises(ValueError):
149
+ sct.gen_text_code_semantic("Test", bits=99)
150
+
151
+
152
+ def test_split_text(text_en):
153
+ chunks = split_text(text_en)
154
+ assert chunks[0][1][:8] == "\n Thank "
155
+ assert chunks[-1][1][:8] == "\n (Laugh"
156
+
157
+
158
+ def test_split_text_override():
159
+ text = "Try some very small and granular text splitting. Use options override for it."
160
+ chunks = split_text(text, max_tokens=8, overlap=4)
161
+ assert chunks == [
162
+ (0, "Try some very small and granular text "),
163
+ (20, "and granular text splitting. "),
164
+ (49, "Use options override for it."),
165
+ ]
166
+
167
+
168
+ def test_tokenize_chunks():
169
+ chunks = ["Hello World", "These are chunks"]
170
+ result = tokenize_chunks(chunks)
171
+ np.testing.assert_array_equal(
172
+ result["input_ids"],
173
+ np.array([[0, 35378, 6661, 2, 1, 1], [0, 32255, 621, 7839, 1224, 2]], dtype=np.int64),
174
+ )
175
+
176
+
177
+ def test_embed_tokens():
178
+ chunks = ["Hello World", "These are chunks"]
179
+ tokens = tokenize_chunks(chunks)
180
+ embeddings = embed_tokens(tokens)
181
+ assert list(embeddings[0][0][:3]) == pytest.approx([0.05907335, 0.11408358, 0.12727071], rel=1e-2)
182
+
183
+
184
+ def test_embed_chunks():
185
+ chunks = ["Hello World"]
186
+ expected = [0.008697219, 0.038051583, 0.043976285]
187
+ embeddings = embed_chunks(chunks)
188
+ assert list(embeddings[0][:3]) == pytest.approx(expected, rel=1e-3)
189
+
190
+
191
+ def test_gen_text_code_semantic(text_en):
192
+ result = sct.gen_text_code_semantic(text_en, embedding=True)
193
+ assert result["iscc"] == "ISCC:CAA636IXQD736IGJ"
194
+ assert result["features"][0]["embedding"][:3] == pytest.approx(
195
+ [0.03241169825196266, 0.022712377831339836, 0.050273094326257706],
196
+ rel=1e-3,
197
+ )
198
+
199
+
200
+ def test_cross_lingual_match(text_en, text_de):
201
+ a = sct.gen_text_code_semantic(text_en)["iscc"]
202
+ assert a == "ISCC:CAA636IXQD736IGJ"
203
+ b = sct.gen_text_code_semantic(text_de)["iscc"]
204
+ assert b == "ISCC:CAA636IXQD4TMIGL" # hamming distance for the codes is 6 bits
205
+
206
+
207
+ def test_tokenizer_integrity(text_en):
208
+ # test if updates break tokenizer compatibility
209
+ hasher = blake3()
210
+ for idx, chunk in split_text(text_en):
211
+ hasher.update(chunk.encode("utf-8"))
212
+ checksum = hasher.hexdigest()
213
+ assert checksum == "7a7ad1ce83c36f853d31390150403e225bac7825a5573dd5c9e326b0917c7b52"
214
+
215
+
216
+ def test_soft_hash_text_semantic():
217
+ result = sct.soft_hash_text_semantic("Hello World")
218
+ assert (
219
+ result.hex()
220
+ == "f36789d8d1bbe351106bdf8e9b5006a3fc4cb1eb4042c75ea26b5058857c9177705429237858e9940e133c8b12ee1a3d"
221
+ )
222
+
223
+
224
+ def test_shift_resistance(text_en):
225
+ a = sct.soft_hash_text_semantic(text_en)
226
+ shifted = "Just put another sentence in the begginging of the text!\n" + text_en
227
+ b = sct.soft_hash_text_semantic(shifted)
228
+ # TODO improve algorithm with more shift resistant semantic chunking
229
+ # On 256-bit code
230
+ assert sct.hamming_distance(a, b) == 6
231
+ # On 64-bit code
232
+ assert sct.hamming_distance(b[:16], a[:16]) == 1
233
+
234
+
235
+ def test_compress():
236
+ arr1 = np.array([3.0, 15294.7789, 32977.7])
237
+ arr2 = np.array([3.0, 15294.7789, 32977.7], dtype=np.float32)
238
+ expected = [3.0, 15294.8, 32977.7]
239
+ assert compress(arr1, 1) == expected
240
+ assert compress(arr2, 1) == expected
241
+
242
+
243
+ def test_embedding_precision():
244
+ d16 = sct.gen_text_code_semantic("Hello World", embedding=True, precision=4)
245
+ assert d16["features"][0]["embedding"][0] == 0.0087
tests/test_main.py ADDED
@@ -0,0 +1,32 @@
1
+ import iscc_sct as sct
2
+
3
+
4
+ def test_create_returns_sct_meta():
5
+ result = sct.create("Hello World")
6
+ assert isinstance(result, sct.Metadata)
7
+
8
+
9
+ def test_create_default():
10
+ result = sct.create("Hello World")
11
+ assert result == sct.Metadata(iscc="ISCC:CAA7GZ4J3DI3XY2R", characters=11)
12
+
13
+
14
+ def test_create_granular():
15
+ result = sct.create("Hello World", granular=True)
16
+ assert result.model_dump(exclude_none=True) == {
17
+ "iscc": "ISCC:CAA7GZ4J3DI3XY2R",
18
+ "characters": 11,
19
+ "features": [
20
+ {
21
+ "maintype": "semantic",
22
+ "subtype": "text",
23
+ "version": 0,
24
+ "simprints": [{"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}],
25
+ }
26
+ ],
27
+ }
28
+
29
+
30
+ def test_create_embedding():
31
+ result = sct.create("Hello World", embedding=True)
32
+ assert len(result.features[0].embedding) == 384
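
Since `create` returns a pydantic `Metadata` model (exercised in `tests/test_models.py` below), results serialize directly to JSON. A small sketch, assuming pydantic v2's `model_dump_json`:

    import iscc_sct as sct

    meta = sct.create("Hello World", granular=True)
    # exclude_none drops the optional fields that were not requested.
    print(meta.model_dump_json(exclude_none=True, indent=2))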
tests/test_models.py ADDED
@@ -0,0 +1,102 @@
1
+ import pytest
2
+ from pydantic import ValidationError
3
+ from iscc_sct.models import Metadata, Feature, FeatureSet
4
+
5
+
6
+ def test_feature_initialization():
7
+ # Test empty initialization
8
+ with pytest.raises(ValidationError):
9
+ Feature()
10
+ feature = Feature(simprint="XZjeSfdyVi0")
11
+ assert feature.simprint == "XZjeSfdyVi0"
12
+ assert feature.offset is None
13
+ assert feature.content is None
14
+
15
+ # Test initialization with values
16
+ feature = Feature(simprint="feature", offset=5, content="example text")
17
+ assert feature.simprint == "feature"
18
+ assert feature.offset == 5
19
+ assert feature.content == "example text"
20
+
21
+
22
+ def test_feature_set_initialization():
23
+ fs = FeatureSet()
24
+ assert fs.model_dump(exclude_none=True) == {"maintype": "semantic", "subtype": "text", "version": 0}
25
+
26
+
27
+ def test_sct_meta_initialization():
28
+ # Test initialization with minimal required fields
29
+ meta = Metadata(iscc="ISCC1234567890")
30
+ assert meta.iscc == "ISCC1234567890"
31
+ assert meta.characters is None
32
+ assert meta.features is None
33
+
34
+ # Test initialization with all fields
35
+ features = [FeatureSet(simprints=[Feature(simprint="feature1", offset=0, content="text1")], embedding=[0.1, 0.2])]
36
+ meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
37
+ assert meta.iscc == "ISCC1234567890"
38
+ assert meta.characters == 1000
39
+ assert meta.features == features
40
+ assert meta.features[0].embedding == [0.1, 0.2]
41
+
42
+
43
+ def test_metadata_to_index_format():
44
+ # Test conversion from Object-Format to Index-Format
45
+ features = [
46
+ FeatureSet(
47
+ simprints=[
48
+ Feature(simprint="feature1", offset=0, size=5, content="text1"),
49
+ Feature(simprint="feature2", offset=5, size=5, content="text2"),
50
+ ]
51
+ )
52
+ ]
53
+ meta = Metadata(iscc="ISCC1234567890", features=features)
54
+ index_meta = meta.to_index_format()
55
+ assert isinstance(index_meta.features[0].simprints[0], str)
56
+ assert index_meta.features[0].simprints == ["feature1", "feature2"]
57
+ assert index_meta.features[0].offsets == [0, 5]
58
+ assert index_meta.features[0].sizes == [5, 5]
59
+ assert index_meta.features[0].contents == ["text1", "text2"]
60
+
61
+ # Test that Index-Format remains unchanged
62
+ index_meta2 = index_meta.to_index_format()
63
+ assert index_meta2.model_dump() == index_meta.model_dump()
64
+
65
+
66
+ def test_metadata_to_object_format():
67
+ # Test conversion from Index-Format to Object-Format
68
+ features = [
69
+ FeatureSet(simprints=["feature1", "feature2"], offsets=[0, 5], sizes=[5, 5], contents=["text1", "text2"])
70
+ ]
71
+ meta = Metadata(iscc="ISCC1234567890", features=features)
72
+ object_meta = meta.to_object_format()
73
+ assert isinstance(object_meta.features[0].simprints[0], Feature)
74
+ assert object_meta.features[0].simprints[0].simprint == "feature1"
75
+ assert object_meta.features[0].simprints[0].offset == 0
76
+ assert object_meta.features[0].simprints[0].size == 5
77
+ assert object_meta.features[0].simprints[0].content == "text1"
78
+ assert object_meta.features[0].offsets is None
79
+ assert object_meta.features[0].sizes is None
80
+ assert object_meta.features[0].contents is None
81
+
82
+ # Test that Object-Format remains unchanged
83
+ object_meta2 = object_meta.to_object_format()
84
+ assert object_meta2.model_dump() == object_meta.model_dump()
85
+
86
+
87
+ def test_metadata_to_index_format_with_none_simprints():
88
+ # Test conversion when feature_set.simprints is None
89
+ features = [FeatureSet(simprints=None, embedding=[0.1, 0.2])]
90
+ meta = Metadata(iscc="ISCC1234567890", features=features)
91
+ index_meta = meta.to_index_format()
92
+ assert index_meta.features[0].simprints is None
93
+ assert index_meta.features[0].embedding == [0.1, 0.2]
94
+ assert index_meta.model_dump() == meta.model_dump()
95
+
96
+
97
+ def test_metadata_format_conversion_with_no_features():
98
+ meta = Metadata(iscc="ISCC1234567890")
99
+ index_meta = meta.to_index_format()
100
+ object_meta = meta.to_object_format()
101
+ assert index_meta.model_dump() == meta.model_dump()
102
+ assert object_meta.model_dump() == meta.model_dump()
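
The conversions exercised above reshape the same data between two layouts: Object-Format keeps one `Feature` record per simprint, while Index-Format stores compact parallel arrays under the pluralized field names. An illustrative reshaping only, using plain dicts; the real conversion lives in `iscc_sct.models`:

    # Object-Format: one record per feature.
    object_format = [
        {"simprint": "feature1", "offset": 0, "size": 5, "content": "text1"},
        {"simprint": "feature2", "offset": 5, "size": 5, "content": "text2"},
    ]

    # Index-Format: parallel arrays keyed by pluralized field names.
    index_format = {
        key + "s": [record[key] for record in object_format]
        for key in ("simprint", "offset", "size", "content")
    }
    assert index_format["simprints"] == ["feature1", "feature2"]
    assert index_format["offsets"] == [0, 5]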
tests/test_readme.py ADDED
@@ -0,0 +1,11 @@
1
+ import doctest
2
+ from pathlib import Path
3
+
4
+ README = Path(__file__).parent.parent / "README.md"
5
+
6
+
7
+ def test_readme_examples():
8
+ failure_count, test_count = doctest.testfile(
9
+ README.as_posix(), module_relative=False, optionflags=doctest.ELLIPSIS, raise_on_error=False
10
+ )
11
+ assert failure_count == 0, f"{failure_count} out of {test_count} doctests failed"
tests/test_utils.py ADDED
@@ -0,0 +1,92 @@
1
+ import pytest
2
+ import iscc_sct as sct
3
+ from iscc_sct import utils
4
+ from blake3 import blake3
5
+
6
+
7
+ def test_check_integrity(tmp_path):
8
+ # Create a temporary file with known content
9
+ file_path = tmp_path / "testfile.txt"
10
+ content = "This is a test file."
11
+ with open(file_path, "w") as f:
12
+ f.write(content)
13
+
14
+ # Generate a correct checksum and then alter it to simulate failure
15
+ hasher = blake3()
16
+ hasher.update(content.encode())
17
+ correct_checksum = hasher.hexdigest()
18
+ assert utils.check_integrity(file_path, correct_checksum) == file_path
19
+
20
+ wrong_checksum = correct_checksum + "wrong" # Deliberately incorrect checksum
21
+
22
+ # Test the function with the wrong checksum
23
+ with pytest.raises(RuntimeError) as exc_info:
24
+ utils.check_integrity(file_path, wrong_checksum)
25
+
26
+ # Check that the exception message contains expected text
27
+ assert "Failed integrity check" in str(exc_info.value)
28
+
29
+
30
+ def test_hamming_distance_identical():
31
+ a = b"abc"
32
+ b = b"abc"
33
+ assert utils.hamming_distance(a, b) == 0
34
+
35
+
36
+ def test_hamming_distance_different():
37
+ a = b"abc"
38
+ b = b"abd"
39
+ assert utils.hamming_distance(a, b) == 3
40
+
41
+
42
+ def test_hamming_distance_completely_different():
43
+ a = b"\x00"
44
+ b = b"\xff"
45
+ assert utils.hamming_distance(a, b) == 8
46
+
47
+
48
+ def test_hamming_distance_raises_value_error():
49
+ a = b"abc"
50
+ b = b"abcd"
51
+ with pytest.raises(ValueError):
52
+ utils.hamming_distance(a, b)
53
+
54
+
55
+ def test_encode_decode_base32():
56
+ original = b"Hello, World!"
57
+ encoded = utils.encode_base32(original)
58
+ assert isinstance(encoded, str)
59
+ assert encoded == "JBSWY3DPFQQFO33SNRSCC"
60
+ decoded = utils.decode_base32(encoded)
61
+ assert isinstance(decoded, bytes)
62
+ assert decoded == original
63
+
64
+
65
+ def test_encode_decode_base64():
66
+ original = b"Hello, World!"
67
+ encoded = utils.encode_base64(original)
68
+ assert isinstance(encoded, str)
69
+ assert encoded == "SGVsbG8sIFdvcmxkIQ"
70
+ decoded = utils.decode_base64(encoded)
71
+ assert isinstance(decoded, bytes)
72
+ assert decoded == original
73
+
74
+
75
+ def test_encode_decode_edge_cases():
76
+ # Test empty input
77
+ assert utils.encode_base32(b"") == ""
78
+ assert utils.decode_base32("") == b""
79
+ assert utils.encode_base64(b"") == ""
80
+ assert utils.decode_base64("") == b""
81
+
82
+ # Test input with padding
83
+ original = b"a"
84
+ assert utils.decode_base32(utils.encode_base32(original)) == original
85
+ assert utils.decode_base64(utils.encode_base64(original)) == original
86
+
87
+
88
+ def test_iscc_distance_different_lengths():
89
+ iscc1 = sct.create("Hello", bits=64).iscc
90
+ iscc2 = sct.create("Hello", bits=96).iscc
91
+ with pytest.raises(ValueError, match="The input ISCCs must have the same length"):
92
+ utils.iscc_distance(iscc1, iscc2)
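
The expected strings in these tests show that `encode_base32` and `encode_base64` produce unpadded output. A sketch of padding-free helpers consistent with those values, built on the standard library (the actual `iscc_sct.utils` implementation may differ in detail):

    import base64

    def encode_base32(data: bytes) -> str:
        return base64.b32encode(data).decode("ascii").rstrip("=")

    def decode_base32(code: str) -> bytes:
        # Restore padding to a multiple of 8 characters before decoding.
        return base64.b32decode(code + "=" * (-len(code) % 8))

    def encode_base64(data: bytes) -> str:
        return base64.urlsafe_b64encode(data).decode("ascii").rstrip("=")

    def decode_base64(code: str) -> bytes:
        # Restore padding to a multiple of 4 characters before decoding.
        return base64.urlsafe_b64decode(code + "=" * (-len(code) % 4))

    assert encode_base32(b"Hello, World!") == "JBSWY3DPFQQFO33SNRSCC"
    assert encode_base64(b"Hello, World!") == "SGVsbG8sIFdvcmxkIQ"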
tests/visualize.py ADDED
@@ -0,0 +1,90 @@
1
+ from html import escape
2
+ import iscc_sct as sct
3
+
4
+
5
+ def generate_html(fingerprint_data):
6
+ chunks = fingerprint_data["features"][0]["simprints"]  # granular simprints per the ISCC data model
7
+
8
+ # Sort chunks by offset
9
+ chunks.sort(key=lambda x: x["offset"])
10
+
11
+ html_content = f"""
12
+ <!DOCTYPE html>
13
+ <html lang="en">
14
+ <head>
15
+ <meta charset="UTF-8">
16
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
17
+ <title>Text Fingerprint Visualization</title>
18
+ <script src="https://cdn.tailwindcss.com"></script>
19
+ </head>
20
+ <body class="bg-gray-100 p-8">
21
+ <div class="max-w-4xl mx-auto bg-white p-6 rounded-lg shadow-lg">
22
+ <h1 class="text-2xl font-bold mb-4">Text Fingerprint Visualization</h1>
23
+ <div class="text-sm mb-4">
24
+ <span class="font-semibold">ISCC:</span> {fingerprint_data['iscc']}
25
+ </div>
26
+ <div class="text-sm mb-4">
27
+ <span class="font-semibold">Characters:</span> {fingerprint_data['characters']}
28
+ </div>
29
+ <div class="relative text-base leading-relaxed whitespace-pre-wrap">
30
+ """
31
+
32
+ chunk_color = "bg-yellow-100"
33
+ overlap_color = "bg-red-100"
34
+
35
+ current_pos = 0
36
+ for i, chunk in enumerate(chunks):
37
+ start = max(chunk["offset"], current_pos)
38
+ end = chunk["offset"] + chunk["size"]
39
+
40
+ if start < end:
41
+ # Function to escape text and preserve line breaks
42
+ def escape_and_preserve_breaks(text):
43
+ return escape(text).replace("\n", "<br>")
44
+
45
+ # Non-overlapping part
46
+ html_content += f'<span class="{overlap_color}">{escape_and_preserve_breaks(chunk["text"][current_pos - chunk["offset"]:start - chunk["offset"]])}'
47
+
48
+ # Overlapping part (if any)
49
+ if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
50
+ overlap_end = chunks[i + 1]["offset"]
51
+ html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
52
+ html_content += escape_and_preserve_breaks(chunk["text"][overlap_end - chunk["offset"] :])
53
+ else:
54
+ html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
55
+
56
+ # Fingerprint badge
57
+ html_content += f'<span class="inline-block bg-gray-800 text-white text-xs px-2 py-1 rounded ml-1">{chunk["feature"]}</span>'
58
+
59
+ html_content += "</span>"
60
+
61
+ current_pos = end
62
+
63
+ html_content += """
64
+ </div>
65
+ </div>
66
+ </body>
67
+ </html>
68
+ """
69
+ return html_content
70
+
71
+
72
+ def main():
73
+ with open("../README.md", "rb") as f:
74
+ data = f.read()
75
+
76
+ text = data.decode("utf-8")
77
+
78
+ result = sct.create(text, granular=True)
79
+ print(result.model_dump())
80
+
81
+ # Generate the HTML content
82
+ html_content = generate_html(result.model_dump())
83
+
84
+ # Write the HTML content to a file
85
+ with open("readme.html", "wt", encoding="utf-8") as f:
86
+ f.write(html_content)
87
+
88
+
89
+ if __name__ == "__main__":
90
+ main()
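
Note that `main()` resolves `../README.md` relative to the working directory, so the script is meant to be run from inside the `tests/` directory; it writes the highlighted visualization to `readme.html` in the current working directory.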