titusz committed on
Commit 73ab668
1 Parent(s): 63ba62d

Synced repo using 'sync_with_huggingface' Github Action

.editorconfig CHANGED
@@ -12,7 +12,7 @@ indent_size = 4
 end_of_line = lf
 insert_final_newline = true
 trim_trailing_whitespace = true
-max_line_length = 119
+max_line_length = 100


 # YAML files
iscc_sct/cli.py CHANGED
@@ -8,9 +8,15 @@ from charset_normalizer import from_bytes

 def main():
     parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
-    parser.add_argument("path", type=str, help="Path to text files (supports glob patterns).", nargs="?")
-    parser.add_argument("-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)")
-    parser.add_argument("-g", "--granular", action="store_true", help="Activate granular processing.")
+    parser.add_argument(
+        "path", type=str, help="Path to text files (supports glob patterns).", nargs="?"
+    )
+    parser.add_argument(
+        "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
+    )
+    parser.add_argument(
+        "-g", "--granular", action="store_true", help="Activate granular processing."
+    )
     parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
     args = parser.parse_args()

iscc_sct/code_semantic_text.py CHANGED
@@ -233,11 +233,15 @@ def model():
     so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
     try:
         with sct.timer("ONNXMODEL load time"):
-            return rt.InferenceSession(sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers)
+            return rt.InferenceSession(
+                sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers
+            )
     except NoSuchFile:  # pragma: no cover
         with sct.timer("ONNXMODEL aquisition/load time"):
             model_path = sct.get_model()
-            return rt.InferenceSession(model_path, sess_options=so, providers=selected_onnx_providers)
+            return rt.InferenceSession(
+                model_path, sess_options=so, providers=selected_onnx_providers
+            )


 def tokenize_chunks(chunks):
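The hunk above only reflows the two rt.InferenceSession(...) calls to the new 100-character line limit. For context, a minimal standalone sketch of how such a session is typically constructed with an explicit provider list; the model path and the provider preference below are illustrative assumptions only, since the package resolves its model via sct.MODEL_PATH / sct.get_model() and supplies its own selected_onnx_providers list as shown in the diff.

import onnxruntime as rt

so = rt.SessionOptions()
so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

# Illustrative provider preference (assumption): use CUDA when available, else CPU.
available = rt.get_available_providers()
providers = [p for p in ("CUDAExecutionProvider", "CPUExecutionProvider") if p in available]

# "model.onnx" is a placeholder path, not the model file shipped with iscc-sct.
session = rt.InferenceSession("model.onnx", sess_options=so, providers=providers)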
iscc_sct/demo.py CHANGED
@@ -6,6 +6,7 @@ from loguru import logger as log
 import gradio as gr
 import iscc_sct as sct
 import textwrap
+import yaml


 newline_symbols = {
@@ -84,7 +85,9 @@ def generate_similarity_bar(similarity):

     # Adjust the text position to be centered within the colored bar
     text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
-    text_alignment = "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
+    text_alignment = (
+        "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
+    )

     bar_html = f"""
     <h3>Semantic Similarity</h3>
@@ -97,66 +100,12 @@ def generate_similarity_bar(similarity):
     return bar_html


-# Sample texts
-sample_text_en = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-This document specifies the syntax and structure of the International Standard Content Code (ISCC),
-as an identification system for digital assets (including encodings of text, images, audio, video or other content
-across all media sectors). It also describes ISCC metadata and the use of ISCC in conjunction with other schemes, such
-as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
-
-An ISCC applies to a specific digital asset and is a data-descriptor deterministically constructed from multiple hash
-digests using the algorithms and rules in this document. This document does not provide information on registration of
-ISCCs.
-""".strip().split("\n\n")
-    ]
-)
-
-sample_text_de = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-Dieses Dokument spezifiziert die Syntax und Struktur des International Standard Content Code (ISCC) als
-Identifizierungssystem für digitale Inhalte (einschließlich Kodierungen von Text, Bildern, Audio, Video oder anderen
-Inhalten in allen Medienbereichen). Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung mit
-anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
-
-Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor, der deterministisch aus mehreren
-Hash-Digests unter Verwendung der Algorithmen und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält
-keine Informationen über die Registrierung von ISCCs.
-""".strip().split("\n\n")
-    ]
-)
-
-sample_text_bg = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-Този документ определя синтаксиса и структурата на Международния стандартен код на съдържанието (ISCC) като система за
-идентификация на цифрови активи (включително кодиране на текст, изображения, аудио, видео или друго съдържание във
-всички медийни сектори). Той описва също метаданните на ISCC и използването на ISCC във връзка с други схеми, като
-DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
-
-ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор на данни, конструиран от множество
-хеш-разходи, като се използват алгоритмите и правилата в настоящия документ. Настоящият документ не предоставя
-информация за регистрацията на ISCC.
-""".strip().split("\n\n")
-    ]
-)
-
-sample_text_zh = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-本文件规定了国际标准内容代码(ISCC)的语法和结构,作为数字资产(包括所有媒 体领域的文本、图像、音频、视频或其他内容的编码)的标识系统。它还介绍了
-ISCC 元数据以及 ISCC 与其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的结合使用。
-
-ISCC 适用于特定的数字资产,是使用本文件中的算法和规则从多个哈希摘要中确定性地建 立起来的数据描述符。本文件不提供有关 ISCC 注册的信息。
-""".strip().split("\n\n")
-    ]
-)
+def load_samples():
+    with open("iscc_sct/samples.yml", "r", encoding="utf-8") as file:
+        return yaml.safe_load(file)["samples"]
+
+
+samples = load_samples()

 custom_css = """
 """
@@ -176,67 +125,81 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
         """,
     )
     with gr.Row(variant="panel"):
-        in_iscc_bits = gr.Slider(
-            label="ISCC Bit-Length",
-            info="NUMBER OF BITS FOR OUTPUT ISCC",
-            minimum=64,
-            maximum=256,
-            step=32,
-            value=128,
-        )
+        with gr.Column(variant="panel"):
+            sample_dropdown_a = gr.Dropdown(
+                choices=["None"] + [lang for lang in samples["a"]],
+                label="Select sample for Text A",
+                value="None",
+            )
+        with gr.Column(variant="panel"):
+            sample_dropdown_b = gr.Dropdown(
+                choices=["None"] + [lang for lang in samples["b"]],
+                label="Select sample for Text B",
+                value="None",
+            )
+
     with gr.Row(variant="panel"):
         with gr.Column(variant="panel"):
             in_text_a = gr.TextArea(
                 label="Text A",
-                placeholder="Choose sample text from the dropdown or type or paste your text.",
+                placeholder="Choose sample text from the dropdown above or type or paste your text.",
                 lines=12,
                 max_lines=12,
            )
-            sample_dropdown_a = gr.Dropdown(
-                choices=["None", "English", "Bulgarian"], label="Select sample for Text A", value="None"
-            )
             out_code_a = gr.Textbox(label="ISCC Code for Text A")
-            out_chunks_a = gr.HighlightedText(
-                label="Chunked Text A",
-                interactive=False,
-                elem_id="chunked-text-a",
-            )
         with gr.Column(variant="panel"):
             in_text_b = gr.TextArea(
                 label="Text B",
-                placeholder="Choose sample text from the dropdown or type or paste your text.",
+                placeholder="Choose sample text from the dropdown above or type or paste your text.",
                 lines=12,
                 max_lines=12,
            )
-            sample_dropdown_b = gr.Dropdown(
-                choices=["None", "German", "Chinese"], label="Select sample for Text B", value="None"
-            )
             out_code_b = gr.Textbox(label="ISCC Code for Text B")
+
+    with gr.Row(variant="panel"):
+        with gr.Column(variant="panel"):
+            out_similarity = gr.HTML(label="Similarity")
+
+    with gr.Row(variant="panel"):
+        in_iscc_bits = gr.Slider(
+            label="ISCC Bit-Length",
+            info="NUMBER OF BITS FOR OUTPUT ISCC",
+            minimum=64,
+            maximum=256,
+            step=32,
+            value=64,
+        )
+
+    with gr.Row(variant="panel"):
+        with gr.Column(variant="panel"):
+            out_chunks_a = gr.HighlightedText(
+                label="Chunked Text A",
+                interactive=False,
+                elem_id="chunked-text-a",
+            )
+        with gr.Column(variant="panel"):
             out_chunks_b = gr.HighlightedText(
                 label="Chunked Text B",
                 interactive=False,
                 elem_id="chunked-text-b",
            )

-    def update_sample_text(choice, text_a_or_b):
+    def update_sample_text(choice, group):
         if choice == "None":
             return ""
-        if text_a_or_b == "A":
-            return sample_text_en if choice == "English" else sample_text_bg
-        else:
-            return sample_text_de if choice == "German" else sample_text_zh
+        return samples[group][choice]

     sample_dropdown_a.change(
-        lambda choice: update_sample_text(choice, "A"), inputs=[sample_dropdown_a], outputs=[in_text_a]
+        lambda choice: update_sample_text(choice, "a"),
+        inputs=[sample_dropdown_a],
+        outputs=[in_text_a],
     )
     sample_dropdown_b.change(
-        lambda choice: update_sample_text(choice, "B"), inputs=[sample_dropdown_b], outputs=[in_text_b]
+        lambda choice: update_sample_text(choice, "b"),
+        inputs=[sample_dropdown_b],
+        outputs=[in_text_b],
     )

-    with gr.Row(variant="panel"):
-        with gr.Column(variant="panel"):
-            out_similarity = gr.HTML(label="Similarity")
-
     def process_text(text, nbits, suffix):
         log.debug(f"{text[:20]}")
         out_code_func = globals().get(f"out_code_{suffix}")
@@ -248,7 +211,9 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
             out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
         }

-        result = sct.gen_text_code_semantic(text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True)
+        result = sct.gen_text_code_semantic(
+            text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
+        )
         iscc = sct.Metadata(**result).to_object_format()

         # Generate chunked text with simprints and overlaps
@@ -318,14 +283,22 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
         show_progress="full",
     )

-    out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
-    out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
+    out_code_a.change(
+        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
+    )
+    out_code_b.change(
+        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
+    )

     def reset_all():
         return (
             gr.Slider(value=128),  # Reset ISCC Bit-Length
-            gr.Dropdown(value="None"),  # Reset sample dropdown A
-            gr.Dropdown(value="None"),  # Reset sample dropdown B
+            gr.Dropdown(
+                value="None", choices=["None"] + [f"a:{lang}" for lang in samples["a"]]
+            ),  # Reset sample dropdown A
+            gr.Dropdown(
+                value="None", choices=["None"] + [f"b:{lang}" for lang in samples["b"]]
+            ),  # Reset sample dropdown B
             gr.TextArea(value=""),  # Reset Text A
             gr.TextArea(value=""),  # Reset Text B
             gr.Textbox(value=""),  # Reset ISCC Code for Text A
iscc_sct/dev.py CHANGED
@@ -1,4 +1,5 @@
 import pathlib
+import yaml


 HERE = pathlib.Path(__file__).parent.absolute()
@@ -20,3 +21,22 @@ def convert_lf():  # pragma: no cover
             outfile.write(content)
             n += 1
     print(f"{n} files converted to LF")
+
+
+def format_yml():
+    for f in HERE.glob("**\*.yml"):
+        with open(f, "rt", encoding="utf-8") as infile:
+            data = yaml.safe_load(infile)
+        with open(f, "wt", encoding="utf-8", newline="\n") as outf:
+            yaml.safe_dump(
+                data,
+                outf,
+                indent=2,
+                width=80,
+                encoding="utf-8",
+                sort_keys=False,
+                default_flow_style=False,
+                default_style=">",
+                allow_unicode=True,
+                line_break="\n",
+            )
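The new format_yml task leans on PyYAML's safe_dump options (default_style=">", allow_unicode, width=80) to normalize YAML files. A minimal standalone sketch of the effect, not part of the commit and using a made-up one-entry mapping:

import yaml

data = {"samples": {"a": {"English": "This document specifies the syntax and structure of the ISCC.\n"}}}
print(
    yaml.safe_dump(
        data,
        indent=2,
        width=80,
        sort_keys=False,
        default_flow_style=False,
        default_style=">",
        allow_unicode=True,
    )
)
# Keys come out double-quoted and string values as folded block scalars wrapped at
# roughly 80 columns, which is the layout of the iscc_sct/samples.yml file added
# further below, e.g.:
#   "samples":
#     "a":
#       "English": >
#         This document specifies the syntax and structure of the ISCC.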
iscc_sct/models.py CHANGED
@@ -82,7 +82,9 @@ class PrettyBaseModel(BaseModel):
         return self.pretty_repr()

     def pretty_repr(self):
-        return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False)
+        return self.model_dump_json(
+            indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False
+        )


 class Feature(PrettyBaseModel):
@@ -132,9 +134,15 @@ class Metadata(PrettyBaseModel):
                 new_features.append(new_feature_set)
             else:
                 new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
-                new_feature_set.offsets = [f.offset for f in feature_set.simprints if f.offset is not None]
-                new_feature_set.sizes = [f.size for f in feature_set.simprints if f.size is not None]
-                new_feature_set.contents = [f.content for f in feature_set.simprints if f.content is not None]
+                new_feature_set.offsets = [
+                    f.offset for f in feature_set.simprints if f.offset is not None
+                ]
+                new_feature_set.sizes = [
+                    f.size for f in feature_set.simprints if f.size is not None
+                ]
+                new_feature_set.contents = [
+                    f.content for f in feature_set.simprints if f.content is not None
+                ]
                 new_features.append(new_feature_set)

         return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
@@ -154,7 +162,9 @@ class Metadata(PrettyBaseModel):
         # Convert to object format if in index format
         feature_set = self.to_object_format().features[0]

-        if not all(feature.content and feature.offset is not None for feature in feature_set.simprints):
+        if not all(
+            feature.content and feature.offset is not None for feature in feature_set.simprints
+        ):
             return None

         # Sort features by offset
@@ -191,7 +201,9 @@ class Metadata(PrettyBaseModel):
         # Convert to object format if in index format
         feature_set = self.to_object_format().features[0]

-        if not all(feature.content and feature.offset is not None for feature in feature_set.simprints):
+        if not all(
+            feature.content and feature.offset is not None for feature in feature_set.simprints
+        ):
             return []

         # Sort features by offset
iscc_sct/options.py CHANGED
@@ -29,15 +29,27 @@ class SctOptions(BaseSettings):
         multiple_of=32,
     )

-    characters: bool = Field(True, description="ISCC_SCT_CHARACTERS - Include document character count")
-    embedding: bool = Field(False, description="ISCC_SCT_EMBEDDING - Include global document embedding")
+    characters: bool = Field(
+        True, description="ISCC_SCT_CHARACTERS - Include document character count"
+    )
+    embedding: bool = Field(
+        False, description="ISCC_SCT_EMBEDDING - Include global document embedding"
+    )

-    precision: int = Field(8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)")
+    precision: int = Field(
+        8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)"
+    )

-    simprints: bool = Field(False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints")
-    offsets: bool = Field(False, description="ISCC_SCT_OFFSETS - Include offsets of granular features")
+    simprints: bool = Field(
+        False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints"
+    )
+    offsets: bool = Field(
+        False, description="ISCC_SCT_OFFSETS - Include offsets of granular features"
+    )

-    sizes: bool = Field(False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)")
+    sizes: bool = Field(
+        False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)"
+    )

     contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")

@@ -52,7 +64,9 @@ class SctOptions(BaseSettings):
         description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
     )

-    trim: bool = Field(False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)")
+    trim: bool = Field(
+        False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)"
+    )

     model_config = SettingsConfigDict(
         env_file=".env",
iscc_sct/samples.yml ADDED
@@ -0,0 +1,105 @@
+"samples":
+  "a":
+    "English": >
+      This document specifies the syntax and structure of the International Standard
+      Content Code (ISCC), as an identification system for digital assets (including
+      encodings of text, images, audio, video or other content across all media sectors).
+      It also describes ISCC metadata and the use of ISCC in conjunction with other
+      schemes, such as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
+
+
+      An ISCC applies to a specific digital asset and is a data-descriptor deterministically
+      constructed from multiple hash digests using the algorithms and rules in this
+      document. This document does not provide information on registration of ISCCs.
+    "Hungarian": >
+      Ez a dokumentum meghatározza a Nemzetközi Szabványos Tartalomkód (ISCC) szintaxisát
+      és szerkezetét, amely a digitális eszközök azonosítási rendszere (beleértve
+      a szövegek, képek, hang-, videó- vagy egyéb tartalmak kódolását minden médiaszektorban).
+      Leírja továbbá az ISCC metaadatokat és az ISCC használatát más sémákkal, mint
+      például a DOI, ISAN, ISBN, ISRC, ISSN és ISWC.
+
+
+      Az ISCC egy adott digitális eszközre vonatkozik, és több hash kivonatból determinisztikusan
+      összeállított adatleíró a jelen dokumentumban szereplő algoritmusok és szabályok
+      alapján. Ez a dokumentum nem ad tájékoztatást az ISCC-k regisztrációjáról.
+    "Bulgarian": >
+      Този документ определя синтаксиса и структурата на Международния стандартен
+      код на съдържанието (ISCC) като система за идентификация на цифрови активи (включително
+      кодиране на текст, изображения, аудио, видео или друго съдържание във всички
+      медийни сектори). Той описва също метаданните на ISCC и използването на ISCC
+      във връзка с други схеми, като DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
+
+
+      ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор
+      на данни, конструиран от множество хеш-разходи, като се използват алгоритмите
+      и правилата в настоящия документ. Настоящият документ не предоставя информация
+      за регистрацията на ISCC.
+    "Finnish": >
+      Tässä asiakirjassa määritellään ISCC:n (International Standard Content Code)
+      syntaksi ja rakenne digitaalisen omaisuuden tunnistusjärjestelmänä (mukaan lukien
+      tekstin, kuvien, äänen, videon tai muun sisällön koodaukset kaikilla mediasektoreilla).
+      Siinä kuvataan myös ISCC-metatiedot ja ISCC:n käyttö muiden järjestelmien, kuten
+      DOI, ISAN, ISBN, ISRC, ISSN ja ISWC, kanssa.
+
+
+      ISCC koskee tiettyä digitaalista omaisuutta, ja se on datakuvaaja, joka on deterministisesti
+      muodostettu useista hajautuskoosteista käyttämällä tämän asiakirjan algoritmeja
+      ja sääntöjä. Tämä asiakirja ei sisällä tietoja ISCC:iden rekisteröinnistä.
+    "Arabic": >
+      تحدد هذه الوثيقة بناء جملة وبنية كود المحتوى القياسي الدولي (ISCC)، كنظام تعريف
+      للأصول الرقمية (بما في ذلك ترميز النصوص أو الصور أو الصوت أو الفيديو أو أي محتوى
+      آخر عبر جميع قطاعات الوسائط). ويصف أيضًا بيانات تعريف ISCC واستخدام ISCC بالتزامن
+      مع مخططات أخرى، مثل DOI وISAN وISBN وISRC وISSN وISWC.
+
+
+      ينطبق ISCC على أصل رقمي محدد وهو عبارة عن واصف بيانات تم إنشاؤه بشكل حتمي من
+      ملخصات تجزئة متعددة باستخدام الخوارزميات والقواعد الواردة في هذه الوثيقة. لا
+      تقدم هذه الوثيقة معلومات عن تسجيل ISCCs.
+  "b":
+    "Paraphrased": >
+      In order to identify digital assets (such as encodings of text, images, music,
+      video, and other content across all media sectors), this paper outlines the
+      syntax and structure of the International Standard Content Code (ISCC). Additionally,
+      it explains how to use ISCC metadata and how to combine it with other schemes
+      like DOI, ISAN, ISBN, ISRC, ISSN, and ISWC.
+
+
+      An ISCC is a data-descriptor that is applied to a particular digital asset and
+      is deterministically built from several hash digests utilizing the rules and
+      techniques in this specification. There is no information about ISCC registration
+      in this paper.
+    "German": >
+      Dieses Dokument spezifiziert die Syntax und Struktur des International Standard
+      Content Code (ISCC) als Identifizierungssystem für digitale Inhalte (einschließlich
+      Kodierungen von Text, Bildern, Audio, Video oder anderen Inhalten in allen Medienbereichen).
+      Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung
+      mit anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
+
+
+      Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor,
+      der deterministisch aus mehreren Hash-Digests unter Verwendung der Algorithmen
+      und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält keine Informationen
+      über die Registrierung von ISCCs.
+    "Chinese": >
+      本文件規定了國際標準內容編碼 (ISCC) 的語法和結構,作為數位資產 (包括所有媒體領域的文字、影像、音訊、視訊或其他內容的編碼) 的識別系統。它還介紹了
+      ISCC 元資料以及 ISCC 與其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的結合使用。
+
+
+      ISCC 適用於特定的數位資產,是使用本文件中的演算法和規則,由多個雜湊摘要(hash digests)確定地建構出來的資料描述符(data-descriptor)。本文件不提供
+      ISCC 的註冊資訊。
+    "Korean": >
+      이 문서는 디지털 자산(모든 미디어 부문의 텍스트, 이미지, 오디오, 비디오 또는 기타 콘텐츠의 인코딩 포함)에 대한 식별 시스템인 ISCC(국제
+      표준 콘텐츠 코드)의 구문과 구조를 지정합니다. 또한 ISCC 메타데이터와 DOI, ISAN, ISBN, ISRC, ISSN 및 ISWC와
+      같은 다른 체계와 함께 ISCC를 사용하는 방법에 대해 설명합니다.
+
+
+      ISCC는 특정 디지털 자산에 적용되며 이 문서의 알고리즘과 규칙을 사용하여 여러 해시 다이제스트에서 결정론적으로 구성된 데이터 설명자입니다.
+      이 문서는 ISCC 등록에 대한 정보를 제공하지 않습니다.
+    "Japanese": >
+      この文書は、デジタル資産 (すべてのメディア セクターにわたるテキスト、画像、オーディオ、ビデオ、またはその他のコンテンツのエンコードを含む) の識別システムとして、国際標準コンテンツ
+      コード (ISCC) の構文と構造を指定します。また、ISCC メタデータと、DOI、ISAN、ISBN、ISRC、ISSN、ISWC などの他のスキームと組み合わせた
+      ISCC の使用についても説明します。
+
+
+      ISCC は特定のデジタル資産に適用され、本書のアルゴリズムとルールを使用して複数のハッシュ ダイジェストから決定論的に構築されるデータ記述子です。この文書には、ISCC
+      の登録に関する情報は記載されていません。
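For reference, a minimal sketch of how this file is consumed; it mirrors load_samples() in the demo.py diff above, and the printed values are only indicative of the structure:

import yaml

with open("iscc_sct/samples.yml", "r", encoding="utf-8") as f:
    samples = yaml.safe_load(f)["samples"]

print(list(samples))                # ['a', 'b'] -- sample groups for Text A and Text B
print(list(samples["a"]))           # ['English', 'Hungarian', 'Bulgarian', 'Finnish', 'Arabic']
print(samples["b"]["German"][:40])  # beginning of the German sample for Text B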
poetry.lock CHANGED
@@ -2796,4 +2796,4 @@ gpu = ["onnxruntime-gpu"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "5b3783fbf4b5b43421199bdac7b259fba589b5be671a18652fc0e305339734c5"
+content-hash = "e4a4f012af4c1e60326f792c8801857dbf9298d8992fdd83d3b8f0688d4c04ea"
pyproject.toml CHANGED
@@ -60,6 +60,7 @@ numpy = "<2.0.0"
 pybase64 = "^1.4.0"
 certifi = ">=2024.07.04"
 gradio = { version = "*", optional = true }
+pyyaml = "^6.0.2"


 [tool.poetry.extras]
@@ -79,7 +80,7 @@ mdformat-gfm-alerts = "*"
 mdformat-frontmatter = "*"

 [tool.ruff]
-line-length = 119
+line-length = 100

 [tool.ruff.format]
 line-ending = "lf"
@@ -89,11 +90,12 @@ omit = ["iscc_sct/dev.py", "tests/", "iscc_sct/demo.py"]

 [tool.poe.tasks]
 format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
-format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
+format-markdown = { cmd = "mdformat --wrap 100 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
+format-yml = { script = "iscc_sct.dev:format_yml", help = "Format YML files"}
 convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
 test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
 update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
-all = ["format-code", "format-markdown", "convert-lf", "test"]
+all = ["format-code", "format-markdown", "format-yml", "convert-lf", "test"]
 update = ["update-dependencies", "all"]

 [build-system]
tests/benchmark.py CHANGED
@@ -32,7 +32,9 @@ def benchmark(folder):
         elapsed_time = end_time - start_time
         total_time += elapsed_time
         file_count += 1
-        log.info(f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}")
+        log.info(
+            f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}"
+        )

     if file_count > 0:
         avg_time = total_time / file_count
@@ -45,7 +47,9 @@ def benchmark(folder):

 def main():
     parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
-    parser.add_argument("folder", type=str, help="Directory containing text files for benchmarking.")
+    parser.add_argument(
+        "folder", type=str, help="Directory containing text files for benchmarking."
+    )
     args = parser.parse_args()

     benchmark(args.folder)
tests/test_cli.py CHANGED
@@ -52,7 +52,9 @@ def test_cli_generate_sct(sample_text_file):


 def test_cli_generate_sct_granular(sample_text_file):
-    result = subprocess.run([sct, str(sample_text_file), "--granular"], capture_output=True, text=True)
+    result = subprocess.run(
+        [sct, str(sample_text_file), "--granular"], capture_output=True, text=True
+    )
     assert result.returncode == 0
     assert "features" in result.stdout

tests/test_iscc_sct.py CHANGED
@@ -178,7 +178,9 @@ def test_embed_tokens():
     chunks = ["Hello World", "These are chunks"]
     tokens = tokenize_chunks(chunks)
     embeddings = embed_tokens(tokens)
-    assert list(embeddings[0][0][:3]) == pytest.approx([0.05907335, 0.11408358, 0.12727071], rel=1e-2)
+    assert list(embeddings[0][0][:3]) == pytest.approx(
+        [0.05907335, 0.11408358, 0.12727071], rel=1e-2
+    )


 def test_embed_chunks():
tests/test_main.py CHANGED
@@ -21,7 +21,9 @@ def test_create_granular():
                 "maintype": "semantic",
                 "subtype": "text",
                 "version": 0,
-                "simprints": [{"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}],
+                "simprints": [
+                    {"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}
+                ],
             }
         ],
     }
tests/test_models.py CHANGED
@@ -22,7 +22,11 @@ def test_feature_initialization():

 def test_feature_set_initialization():
     fs = FeatureSet()
-    assert fs.model_dump(exclude_none=True) == {"maintype": "semantic", "subtype": "text", "version": 0}
+    assert fs.model_dump(exclude_none=True) == {
+        "maintype": "semantic",
+        "subtype": "text",
+        "version": 0,
+    }


 def test_sct_meta_initialization():
@@ -33,7 +37,12 @@ def test_sct_meta_initialization():
     assert meta.features is None

     # Test initialization with all fields
-    features = [FeatureSet(simprints=[Feature(simprint="feature1", offset=0, content="text1")], embedding=[0.1, 0.2])]
+    features = [
+        FeatureSet(
+            simprints=[Feature(simprint="feature1", offset=0, content="text1")],
+            embedding=[0.1, 0.2],
+        )
+    ]
     meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
     assert meta.iscc == "ISCC1234567890"
     assert meta.characters == 1000
@@ -67,7 +76,12 @@ def test_metadata_to_index_format():
 def test_metadata_to_object_format():
     # Test conversion from Index-Format to Object-Format
     features = [
-        FeatureSet(simprints=["feature1", "feature2"], offsets=[0, 5], sizes=[5, 5], contents=["text1", "text2"])
+        FeatureSet(
+            simprints=["feature1", "feature2"],
+            offsets=[0, 5],
+            sizes=[5, 5],
+            contents=["text1", "text2"],
+        )
     ]
     meta = Metadata(iscc="ISCC1234567890", features=features)
     object_meta = meta.to_object_format()
tests/visualize.py CHANGED
@@ -49,7 +49,9 @@ def generate_html(fingerprint_data):
         if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
             overlap_end = chunks[i + 1]["offset"]
             html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
-            html_content += escape_and_preserve_breaks(chunk["text"][overlap_end - chunk["offset"] :])
+            html_content += escape_and_preserve_breaks(
+                chunk["text"][overlap_end - chunk["offset"] :]
+            )
         else:
             html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
