titusz committed on
Commit 73ab668
1 Parent(s): 63ba62d

Synced repo using 'sync_with_huggingface' Github Action

.editorconfig CHANGED
@@ -12,7 +12,7 @@ indent_size = 4
 end_of_line = lf
 insert_final_newline = true
 trim_trailing_whitespace = true
-max_line_length = 119
+max_line_length = 100


 # YAML files
iscc_sct/cli.py CHANGED
@@ -8,9 +8,15 @@ from charset_normalizer import from_bytes

 def main():
     parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
-    parser.add_argument("path", type=str, help="Path to text files (supports glob patterns).", nargs="?")
-    parser.add_argument("-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)")
-    parser.add_argument("-g", "--granular", action="store_true", help="Activate granular processing.")
+    parser.add_argument(
+        "path", type=str, help="Path to text files (supports glob patterns).", nargs="?"
+    )
+    parser.add_argument(
+        "-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
+    )
+    parser.add_argument(
+        "-g", "--granular", action="store_true", help="Activate granular processing."
+    )
     parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
     args = parser.parse_args()

iscc_sct/code_semantic_text.py CHANGED
@@ -233,11 +233,15 @@ def model():
     so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
     try:
         with sct.timer("ONNXMODEL load time"):
-            return rt.InferenceSession(sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers)
+            return rt.InferenceSession(
+                sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers
+            )
     except NoSuchFile:  # pragma: no cover
         with sct.timer("ONNXMODEL aquisition/load time"):
             model_path = sct.get_model()
-            return rt.InferenceSession(model_path, sess_options=so, providers=selected_onnx_providers)
+            return rt.InferenceSession(
+                model_path, sess_options=so, providers=selected_onnx_providers
+            )


 def tokenize_chunks(chunks):
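The hunk above only reflows the two rt.InferenceSession(...) calls to the new 100-character line limit. For context, a minimal standalone sketch of how such a session is typically constructed with an explicit provider list; the model path and the provider preference below are illustrative assumptions only, since the package resolves its model via sct.MODEL_PATH / sct.get_model() and supplies its own selected_onnx_providers list as shown in the diff.

import onnxruntime as rt

so = rt.SessionOptions()
so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL

# Illustrative provider preference (assumption): use CUDA when available, else CPU.
available = rt.get_available_providers()
providers = [p for p in ("CUDAExecutionProvider", "CPUExecutionProvider") if p in available]

# "model.onnx" is a placeholder path, not the model file shipped with iscc-sct.
session = rt.InferenceSession("model.onnx", sess_options=so, providers=providers)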
iscc_sct/demo.py CHANGED
@@ -6,6 +6,7 @@ from loguru import logger as log
 import gradio as gr
 import iscc_sct as sct
 import textwrap
+import yaml


 newline_symbols = {
@@ -84,7 +85,9 @@ def generate_similarity_bar(similarity):

     # Adjust the text position to be centered within the colored bar
     text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
-    text_alignment = "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
+    text_alignment = (
+        "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
+    )

     bar_html = f"""
     <h3>Semantic Similarity</h3>
@@ -97,66 +100,12 @@ def generate_similarity_bar(similarity):
     return bar_html


-# Sample texts
-sample_text_en = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-This document specifies the syntax and structure of the International Standard Content Code (ISCC),
-as an identification system for digital assets (including encodings of text, images, audio, video or other content
-across all media sectors). It also describes ISCC metadata and the use of ISCC in conjunction with other schemes, such
-as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
-
-An ISCC applies to a specific digital asset and is a data-descriptor deterministically constructed from multiple hash
-digests using the algorithms and rules in this document. This document does not provide information on registration of
-ISCCs.
-""".strip().split("\n\n")
-    ]
-)
-
-sample_text_de = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-Dieses Dokument spezifiziert die Syntax und Struktur des International Standard Content Code (ISCC) als
-Identifizierungssystem für digitale Inhalte (einschließlich Kodierungen von Text, Bildern, Audio, Video oder anderen
-Inhalten in allen Medienbereichen). Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung mit
-anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
-
-Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor, der deterministisch aus mehreren
-Hash-Digests unter Verwendung der Algorithmen und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält
-keine Informationen über die Registrierung von ISCCs.
-""".strip().split("\n\n")
-    ]
-)
-
-sample_text_bg = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-Този документ определя синтаксиса и структурата на Международния стандартен код на съдържанието (ISCC) като система за
-идентификация на цифрови активи (включително кодиране на текст, изображения, аудио, видео или друго съдържание във
-всички медийни сектори). Той описва също метаданните на ISCC и използването на ISCC във връзка с други схеми, като
-DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
-
-ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор на данни, конструиран от множество
-хеш-разходи, като се използват алгоритмите и правилата в настоящия документ. Настоящият документ не предоставя
-информация за регистрацията на ISCC.
-""".strip().split("\n\n")
-    ]
-)
-
-sample_text_zh = "\n\n".join(
-    [
-        " ".join(paragraph.split())
-        for paragraph in """
-本文件规定了国际标准内容代码(ISCC)的语法和结构,作为数字资产(包括所有媒 体领域的文本、图像、音频、视频或其他内容的编码)的标识系统。它还介绍了
-ISCC 元数据以及 ISCC 与其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的结合使用。
-
-ISCC 适用于特定的数字资产,是使用本文件中的算法和规则从多个哈希摘要中确定性地建 立起来的数据描述符。本文件不提供有关 ISCC 注册的信息。
-""".strip().split("\n\n")
-    ]
-)
+def load_samples():
+    with open("iscc_sct/samples.yml", "r", encoding="utf-8") as file:
+        return yaml.safe_load(file)["samples"]
+
+
+samples = load_samples()

 custom_css = """
 """
@@ -176,67 +125,81 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
         """,
     )
     with gr.Row(variant="panel"):
-        in_iscc_bits = gr.Slider(
-            label="ISCC Bit-Length",
-            info="NUMBER OF BITS FOR OUTPUT ISCC",
-            minimum=64,
-            maximum=256,
-            step=32,
-            value=128,
-        )
+        with gr.Column(variant="panel"):
+            sample_dropdown_a = gr.Dropdown(
+                choices=["None"] + [lang for lang in samples["a"]],
+                label="Select sample for Text A",
+                value="None",
+            )
+        with gr.Column(variant="panel"):
+            sample_dropdown_b = gr.Dropdown(
+                choices=["None"] + [lang for lang in samples["b"]],
+                label="Select sample for Text B",
+                value="None",
+            )
+
     with gr.Row(variant="panel"):
         with gr.Column(variant="panel"):
             in_text_a = gr.TextArea(
                 label="Text A",
-                placeholder="Choose sample text from the dropdown or type or paste your text.",
+                placeholder="Choose sample text from the dropdown above or type or paste your text.",
                 lines=12,
                 max_lines=12,
            )
-            sample_dropdown_a = gr.Dropdown(
-                choices=["None", "English", "Bulgarian"], label="Select sample for Text A", value="None"
-            )
             out_code_a = gr.Textbox(label="ISCC Code for Text A")
-            out_chunks_a = gr.HighlightedText(
-                label="Chunked Text A",
-                interactive=False,
-                elem_id="chunked-text-a",
-            )
         with gr.Column(variant="panel"):
             in_text_b = gr.TextArea(
                 label="Text B",
-                placeholder="Choose sample text from the dropdown or type or paste your text.",
+                placeholder="Choose sample text from the dropdown above or type or paste your text.",
                 lines=12,
                 max_lines=12,
            )
-            sample_dropdown_b = gr.Dropdown(
-                choices=["None", "German", "Chinese"], label="Select sample for Text B", value="None"
-            )
             out_code_b = gr.Textbox(label="ISCC Code for Text B")
+
+    with gr.Row(variant="panel"):
+        with gr.Column(variant="panel"):
+            out_similarity = gr.HTML(label="Similarity")
+
+    with gr.Row(variant="panel"):
+        in_iscc_bits = gr.Slider(
+            label="ISCC Bit-Length",
+            info="NUMBER OF BITS FOR OUTPUT ISCC",
+            minimum=64,
+            maximum=256,
+            step=32,
+            value=64,
+        )
+
+    with gr.Row(variant="panel"):
+        with gr.Column(variant="panel"):
+            out_chunks_a = gr.HighlightedText(
+                label="Chunked Text A",
+                interactive=False,
+                elem_id="chunked-text-a",
+            )
+        with gr.Column(variant="panel"):
             out_chunks_b = gr.HighlightedText(
                 label="Chunked Text B",
                 interactive=False,
                 elem_id="chunked-text-b",
            )

-    def update_sample_text(choice, text_a_or_b):
+    def update_sample_text(choice, group):
         if choice == "None":
             return ""
-        if text_a_or_b == "A":
-            return sample_text_en if choice == "English" else sample_text_bg
-        else:
-            return sample_text_de if choice == "German" else sample_text_zh
+        return samples[group][choice]

     sample_dropdown_a.change(
-        lambda choice: update_sample_text(choice, "A"), inputs=[sample_dropdown_a], outputs=[in_text_a]
+        lambda choice: update_sample_text(choice, "a"),
+        inputs=[sample_dropdown_a],
+        outputs=[in_text_a],
     )
     sample_dropdown_b.change(
-        lambda choice: update_sample_text(choice, "B"), inputs=[sample_dropdown_b], outputs=[in_text_b]
+        lambda choice: update_sample_text(choice, "b"),
+        inputs=[sample_dropdown_b],
+        outputs=[in_text_b],
     )

-    with gr.Row(variant="panel"):
-        with gr.Column(variant="panel"):
-            out_similarity = gr.HTML(label="Similarity")
-
     def process_text(text, nbits, suffix):
         log.debug(f"{text[:20]}")
         out_code_func = globals().get(f"out_code_{suffix}")
@@ -248,7 +211,9 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
             out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
         }

-        result = sct.gen_text_code_semantic(text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True)
+        result = sct.gen_text_code_semantic(
+            text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
+        )
         iscc = sct.Metadata(**result).to_object_format()

         # Generate chunked text with simprints and overlaps
@@ -318,14 +283,22 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
         show_progress="full",
     )

-    out_code_a.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
-    out_code_b.change(compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity])
+    out_code_a.change(
+        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
+    )
+    out_code_b.change(
+        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
+    )

     def reset_all():
         return (
             gr.Slider(value=128),  # Reset ISCC Bit-Length
-            gr.Dropdown(value="None"),  # Reset sample dropdown A
-            gr.Dropdown(value="None"),  # Reset sample dropdown B
+            gr.Dropdown(
+                value="None", choices=["None"] + [f"a:{lang}" for lang in samples["a"]]
+            ),  # Reset sample dropdown A
+            gr.Dropdown(
+                value="None", choices=["None"] + [f"b:{lang}" for lang in samples["b"]]
+            ),  # Reset sample dropdown B
             gr.TextArea(value=""),  # Reset Text A
             gr.TextArea(value=""),  # Reset Text B
             gr.Textbox(value=""),  # Reset ISCC Code for Text A
iscc_sct/dev.py CHANGED
@@ -1,4 +1,5 @@
 import pathlib
+import yaml


 HERE = pathlib.Path(__file__).parent.absolute()
@@ -20,3 +21,22 @@ def convert_lf():  # pragma: no cover
             outfile.write(content)
             n += 1
     print(f"{n} files converted to LF")
+
+
+def format_yml():
+    for f in HERE.glob("**\*.yml"):
+        with open(f, "rt", encoding="utf-8") as infile:
+            data = yaml.safe_load(infile)
+        with open(f, "wt", encoding="utf-8", newline="\n") as outf:
+            yaml.safe_dump(
+                data,
+                outf,
+                indent=2,
+                width=80,
+                encoding="utf-8",
+                sort_keys=False,
+                default_flow_style=False,
+                default_style=">",
+                allow_unicode=True,
+                line_break="\n",
+            )
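The new format_yml task leans on PyYAML's safe_dump options (default_style=">", allow_unicode, width=80) to normalize YAML files. A minimal standalone sketch of the effect, not part of the commit and using a made-up one-entry mapping:

import yaml

data = {"samples": {"a": {"English": "This document specifies the syntax and structure of the ISCC.\n"}}}
print(
    yaml.safe_dump(
        data,
        indent=2,
        width=80,
        sort_keys=False,
        default_flow_style=False,
        default_style=">",
        allow_unicode=True,
    )
)
# Keys come out double-quoted and string values as folded block scalars wrapped at
# roughly 80 columns, which is the layout of the iscc_sct/samples.yml file added
# further below, e.g.:
#   "samples":
#     "a":
#       "English": >
#         This document specifies the syntax and structure of the ISCC.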
iscc_sct/models.py CHANGED
@@ -82,7 +82,9 @@ class PrettyBaseModel(BaseModel):
         return self.pretty_repr()

     def pretty_repr(self):
-        return self.model_dump_json(indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False)
+        return self.model_dump_json(
+            indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False
+        )


 class Feature(PrettyBaseModel):
@@ -132,9 +134,15 @@ class Metadata(PrettyBaseModel):
                 new_features.append(new_feature_set)
             else:
                 new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
-                new_feature_set.offsets = [f.offset for f in feature_set.simprints if f.offset is not None]
-                new_feature_set.sizes = [f.size for f in feature_set.simprints if f.size is not None]
-                new_feature_set.contents = [f.content for f in feature_set.simprints if f.content is not None]
+                new_feature_set.offsets = [
+                    f.offset for f in feature_set.simprints if f.offset is not None
+                ]
+                new_feature_set.sizes = [
+                    f.size for f in feature_set.simprints if f.size is not None
+                ]
+                new_feature_set.contents = [
+                    f.content for f in feature_set.simprints if f.content is not None
+                ]
                 new_features.append(new_feature_set)

         return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
@@ -154,7 +162,9 @@ class Metadata(PrettyBaseModel):
         # Convert to object format if in index format
         feature_set = self.to_object_format().features[0]

-        if not all(feature.content and feature.offset is not None for feature in feature_set.simprints):
+        if not all(
+            feature.content and feature.offset is not None for feature in feature_set.simprints
+        ):
             return None

         # Sort features by offset
@@ -191,7 +201,9 @@ class Metadata(PrettyBaseModel):
         # Convert to object format if in index format
         feature_set = self.to_object_format().features[0]

-        if not all(feature.content and feature.offset is not None for feature in feature_set.simprints):
+        if not all(
+            feature.content and feature.offset is not None for feature in feature_set.simprints
+        ):
             return []

         # Sort features by offset
iscc_sct/options.py CHANGED
@@ -29,15 +29,27 @@ class SctOptions(BaseSettings):
         multiple_of=32,
     )

-    characters: bool = Field(True, description="ISCC_SCT_CHARACTERS - Include document character count")
-    embedding: bool = Field(False, description="ISCC_SCT_EMBEDDING - Include global document embedding")
+    characters: bool = Field(
+        True, description="ISCC_SCT_CHARACTERS - Include document character count"
+    )
+    embedding: bool = Field(
+        False, description="ISCC_SCT_EMBEDDING - Include global document embedding"
+    )

-    precision: int = Field(8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)")
+    precision: int = Field(
+        8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)"
+    )

-    simprints: bool = Field(False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints")
-    offsets: bool = Field(False, description="ISCC_SCT_OFFSETS - Include offsets of granular features")
+    simprints: bool = Field(
+        False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints"
+    )
+    offsets: bool = Field(
+        False, description="ISCC_SCT_OFFSETS - Include offsets of granular features"
+    )

-    sizes: bool = Field(False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)")
+    sizes: bool = Field(
+        False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)"
+    )

     contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")

@@ -52,7 +64,9 @@ class SctOptions(BaseSettings):
         description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
     )

-    trim: bool = Field(False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)")
+    trim: bool = Field(
+        False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)"
+    )

     model_config = SettingsConfigDict(
         env_file=".env",
iscc_sct/samples.yml ADDED
@@ -0,0 +1,105 @@
+"samples":
+  "a":
+    "English": >
+      This document specifies the syntax and structure of the International Standard
+      Content Code (ISCC), as an identification system for digital assets (including
+      encodings of text, images, audio, video or other content across all media sectors).
+      It also describes ISCC metadata and the use of ISCC in conjunction with other
+      schemes, such as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
+
+
+      An ISCC applies to a specific digital asset and is a data-descriptor deterministically
+      constructed from multiple hash digests using the algorithms and rules in this
+      document. This document does not provide information on registration of ISCCs.
+    "Hungarian": >
+      Ez a dokumentum meghatározza a Nemzetközi Szabványos Tartalomkód (ISCC) szintaxisát
+      és szerkezetét, amely a digitális eszközök azonosítási rendszere (beleértve
+      a szövegek, képek, hang-, videó- vagy egyéb tartalmak kódolását minden médiaszektorban).
+      Leírja továbbá az ISCC metaadatokat és az ISCC használatát más sémákkal, mint
+      például a DOI, ISAN, ISBN, ISRC, ISSN és ISWC.
+
+
+      Az ISCC egy adott digitális eszközre vonatkozik, és több hash kivonatból determinisztikusan
+      összeállított adatleíró a jelen dokumentumban szereplő algoritmusok és szabályok
+      alapján. Ez a dokumentum nem ad tájékoztatást az ISCC-k regisztrációjáról.
+    "Bulgarian": >
+      Този документ определя синтаксиса и структурата на Международния стандартен
+      код на съдържанието (ISCC) като система за идентификация на цифрови активи (включително
+      кодиране на текст, изображения, аудио, видео или друго съдържание във всички
+      медийни сектори). Той описва също метаданните на ISCC и използването на ISCC
+      във връзка с други схеми, като DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
+
+
+      ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор
+      на данни, конструиран от множество хеш-разходи, като се използват алгоритмите
+      и правилата в настоящия документ. Настоящият документ не предоставя информация
+      за регистрацията на ISCC.
+    "Finnish": >
+      Tässä asiakirjassa määritellään ISCC:n (International Standard Content Code)
+      syntaksi ja rakenne digitaalisen omaisuuden tunnistusjärjestelmänä (mukaan lukien
+      tekstin, kuvien, äänen, videon tai muun sisällön koodaukset kaikilla mediasektoreilla).
+      Siinä kuvataan myös ISCC-metatiedot ja ISCC:n käyttö muiden järjestelmien, kuten
+      DOI, ISAN, ISBN, ISRC, ISSN ja ISWC, kanssa.
+
+
+      ISCC koskee tiettyä digitaalista omaisuutta, ja se on datakuvaaja, joka on deterministisesti
+      muodostettu useista hajautuskoosteista käyttämällä tämän asiakirjan algoritmeja
+      ja sääntöjä. Tämä asiakirja ei sisällä tietoja ISCC:iden rekisteröinnistä.
+    "Arabic": >
+      تحدد هذه الوثيقة بناء جملة وبنية كود المحتوى القياسي الدولي (ISCC)، كنظام تعريف
+      للأصول الرقمية (بما في ذلك ترميز النصوص أو الصور أو الصوت أو الفيديو أو أي محتوى
+      آخر عبر جميع قطاعات الوسائط). ويصف أيضًا بيانات تعريف ISCC واستخدام ISCC بالتزامن
+      مع مخططات أخرى، مثل DOI وISAN وISBN وISRC وISSN وISWC.
+
+
+      ينطبق ISCC على أصل رقمي محدد وهو عبارة عن واصف بيانات تم إنشاؤه بشكل حتمي من
+      ملخصات تجزئة متعددة باستخدام الخوارزميات والقواعد الواردة في هذه الوثيقة. لا
+      تقدم هذه الوثيقة معلومات عن تسجيل ISCCs.
+  "b":
+    "Paraphrased": >
+      In order to identify digital assets (such as encodings of text, images, music,
+      video, and other content across all media sectors), this paper outlines the
+      syntax and structure of the International Standard Content Code (ISCC). Additionally,
+      it explains how to use ISCC metadata and how to combine it with other schemes
+      like DOI, ISAN, ISBN, ISRC, ISSN, and ISWC.
+
+
+      An ISCC is a data-descriptor that is applied to a particular digital asset and
+      is deterministically built from several hash digests utilizing the rules and
+      techniques in this specification. There is no information about ISCC registration
+      in this paper.
+    "German": >
+      Dieses Dokument spezifiziert die Syntax und Struktur des International Standard
+      Content Code (ISCC) als Identifizierungssystem für digitale Inhalte (einschließlich
+      Kodierungen von Text, Bildern, Audio, Video oder anderen Inhalten in allen Medienbereichen).
+      Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung
+      mit anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
+
+
+      Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor,
+      der deterministisch aus mehreren Hash-Digests unter Verwendung der Algorithmen
+      und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält keine Informationen
+      über die Registrierung von ISCCs.
+    "Chinese": >
+      本文件規定了國際標準內容編碼 (ISCC) 的語法和結構,作為數位資產 (包括所有媒體領域的文字、影像、音訊、視訊或其他內容的編碼) 的識別系統。它還介紹了
+      ISCC 元資料以及 ISCC 與其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的結合使用。
+
+
+      ISCC 適用於特定的數位資產,是使用本文件中的演算法和規則,由多個雜湊摘要(hash digests)確定地建構出來的資料描述符(data-descriptor)。本文件不提供
+      ISCC 的註冊資訊。
+    "Korean": >
+      이 문서는 디지털 자산(모든 미디어 부문의 텍스트, 이미지, 오디오, 비디오 또는 기타 콘텐츠의 인코딩 포함)에 대한 식별 시스템인 ISCC(국제
+      표준 콘텐츠 코드)의 구문과 구조를 지정합니다. 또한 ISCC 메타데이터와 DOI, ISAN, ISBN, ISRC, ISSN 및 ISWC와
+      같은 다른 체계와 함께 ISCC를 사용하는 방법에 대해 설명합니다.
+
+
+      ISCC는 특정 디지털 자산에 적용되며 이 문서의 알고리즘과 규칙을 사용하여 여러 해시 다이제스트에서 결정론적으로 구성된 데이터 설명자입니다.
+      이 문서는 ISCC 등록에 대한 정보를 제공하지 않습니다.
+    "Japanese": >
+      この文書は、デジタル資産 (すべてのメディア セクターにわたるテキスト、画像、オーディオ、ビデオ、またはその他のコンテンツのエンコードを含む) の識別システムとして、国際標準コンテンツ
+      コード (ISCC) の構文と構造を指定します。また、ISCC メタデータと、DOI、ISAN、ISBN、ISRC、ISSN、ISWC などの他のスキームと組み合わせた
+      ISCC の使用についても説明します。
+
+
+      ISCC は特定のデジタル資産に適用され、本書のアルゴリズムとルールを使用して複数のハッシュ ダイジェストから決定論的に構築されるデータ記述子です。この文書には、ISCC
+      の登録に関する情報は記載されていません。
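For reference, a minimal sketch of how this file is consumed; it mirrors load_samples() in the demo.py diff above, and the printed values are only indicative of the structure:

import yaml

with open("iscc_sct/samples.yml", "r", encoding="utf-8") as f:
    samples = yaml.safe_load(f)["samples"]

print(list(samples))                # ['a', 'b'] -- sample groups for Text A and Text B
print(list(samples["a"]))           # ['English', 'Hungarian', 'Bulgarian', 'Finnish', 'Arabic']
print(samples["b"]["German"][:40])  # beginning of the German sample for Text B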
poetry.lock CHANGED
@@ -2796,4 +2796,4 @@ gpu = ["onnxruntime-gpu"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13"
-content-hash = "5b3783fbf4b5b43421199bdac7b259fba589b5be671a18652fc0e305339734c5"
+content-hash = "e4a4f012af4c1e60326f792c8801857dbf9298d8992fdd83d3b8f0688d4c04ea"
pyproject.toml CHANGED
@@ -60,6 +60,7 @@ numpy = "<2.0.0"
 pybase64 = "^1.4.0"
 certifi = ">=2024.07.04"
 gradio = { version = "*", optional = true }
+pyyaml = "^6.0.2"


 [tool.poetry.extras]
@@ -79,7 +80,7 @@ mdformat-gfm-alerts = "*"
 mdformat-frontmatter = "*"

 [tool.ruff]
-line-length = 119
+line-length = 100

 [tool.ruff.format]
 line-ending = "lf"
@@ -89,11 +90,12 @@ omit = ["iscc_sct/dev.py", "tests/", "iscc_sct/demo.py"]

 [tool.poe.tasks]
 format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
-format-markdown = { cmd = "mdformat --wrap 119 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
+format-markdown = { cmd = "mdformat --wrap 100 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
+format-yml = { script = "iscc_sct.dev:format_yml", help = "Format YML files"}
 convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
 test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
 update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
-all = ["format-code", "format-markdown", "convert-lf", "test"]
+all = ["format-code", "format-markdown", "format-yml", "convert-lf", "test"]
 update = ["update-dependencies", "all"]

 [build-system]
tests/benchmark.py CHANGED
@@ -32,7 +32,9 @@ def benchmark(folder):
         elapsed_time = end_time - start_time
         total_time += elapsed_time
         file_count += 1
-        log.info(f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}")
+        log.info(
+            f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}"
+        )

     if file_count > 0:
         avg_time = total_time / file_count
@@ -45,7 +47,9 @@ def benchmark(folder):

 def main():
     parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
-    parser.add_argument("folder", type=str, help="Directory containing text files for benchmarking.")
+    parser.add_argument(
+        "folder", type=str, help="Directory containing text files for benchmarking."
+    )
     args = parser.parse_args()

     benchmark(args.folder)
tests/test_cli.py CHANGED
@@ -52,7 +52,9 @@ def test_cli_generate_sct(sample_text_file):


 def test_cli_generate_sct_granular(sample_text_file):
-    result = subprocess.run([sct, str(sample_text_file), "--granular"], capture_output=True, text=True)
+    result = subprocess.run(
+        [sct, str(sample_text_file), "--granular"], capture_output=True, text=True
+    )
     assert result.returncode == 0
     assert "features" in result.stdout

tests/test_iscc_sct.py CHANGED
@@ -178,7 +178,9 @@ def test_embed_tokens():
     chunks = ["Hello World", "These are chunks"]
     tokens = tokenize_chunks(chunks)
     embeddings = embed_tokens(tokens)
-    assert list(embeddings[0][0][:3]) == pytest.approx([0.05907335, 0.11408358, 0.12727071], rel=1e-2)
+    assert list(embeddings[0][0][:3]) == pytest.approx(
+        [0.05907335, 0.11408358, 0.12727071], rel=1e-2
+    )


 def test_embed_chunks():
tests/test_main.py CHANGED
@@ -21,7 +21,9 @@ def test_create_granular():
                 "maintype": "semantic",
                 "subtype": "text",
                 "version": 0,
-                "simprints": [{"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}],
+                "simprints": [
+                    {"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}
+                ],
             }
         ],
     }
tests/test_models.py CHANGED
@@ -22,7 +22,11 @@ def test_feature_initialization():

 def test_feature_set_initialization():
     fs = FeatureSet()
-    assert fs.model_dump(exclude_none=True) == {"maintype": "semantic", "subtype": "text", "version": 0}
+    assert fs.model_dump(exclude_none=True) == {
+        "maintype": "semantic",
+        "subtype": "text",
+        "version": 0,
+    }


 def test_sct_meta_initialization():
@@ -33,7 +37,12 @@ def test_sct_meta_initialization():
     assert meta.features is None

     # Test initialization with all fields
-    features = [FeatureSet(simprints=[Feature(simprint="feature1", offset=0, content="text1")], embedding=[0.1, 0.2])]
+    features = [
+        FeatureSet(
+            simprints=[Feature(simprint="feature1", offset=0, content="text1")],
+            embedding=[0.1, 0.2],
+        )
+    ]
     meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
     assert meta.iscc == "ISCC1234567890"
     assert meta.characters == 1000
@@ -67,7 +76,12 @@ def test_metadata_to_index_format():
 def test_metadata_to_object_format():
     # Test conversion from Index-Format to Object-Format
     features = [
-        FeatureSet(simprints=["feature1", "feature2"], offsets=[0, 5], sizes=[5, 5], contents=["text1", "text2"])
+        FeatureSet(
+            simprints=["feature1", "feature2"],
+            offsets=[0, 5],
+            sizes=[5, 5],
+            contents=["text1", "text2"],
+        )
     ]
     meta = Metadata(iscc="ISCC1234567890", features=features)
     object_meta = meta.to_object_format()
tests/visualize.py CHANGED
@@ -49,7 +49,9 @@ def generate_html(fingerprint_data):
         if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
             overlap_end = chunks[i + 1]["offset"]
             html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
-            html_content += escape_and_preserve_breaks(chunk["text"][overlap_end - chunk["offset"] :])
+            html_content += escape_and_preserve_breaks(
+                chunk["text"][overlap_end - chunk["offset"] :]
+            )
         else:
             html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
