Synced repo using 'sync_with_huggingface' GitHub Action
Browse files
- .editorconfig +1 -1
- iscc_sct/cli.py +9 -3
- iscc_sct/code_semantic_text.py +6 -2
- iscc_sct/demo.py +69 -96
- iscc_sct/dev.py +20 -0
- iscc_sct/models.py +18 -6
- iscc_sct/options.py +21 -7
- iscc_sct/samples.yml +105 -0
- poetry.lock +1 -1
- pyproject.toml +5 -3
- tests/benchmark.py +6 -2
- tests/test_cli.py +3 -1
- tests/test_iscc_sct.py +3 -1
- tests/test_main.py +3 -1
- tests/test_models.py +17 -3
- tests/visualize.py +3 -1
.editorconfig
CHANGED
@@ -12,7 +12,7 @@ indent_size = 4
|
|
12 |
end_of_line = lf
|
13 |
insert_final_newline = true
|
14 |
trim_trailing_whitespace = true
|
15 |
-
max_line_length =
|
16 |
|
17 |
|
18 |
# YAML files
|
|
|
12 |
end_of_line = lf
|
13 |
insert_final_newline = true
|
14 |
trim_trailing_whitespace = true
|
15 |
+
max_line_length = 100
|
16 |
|
17 |
|
18 |
# YAML files
|
iscc_sct/cli.py
CHANGED
@@ -8,9 +8,15 @@ from charset_normalizer import from_bytes
|
|
8 |
|
9 |
def main():
|
10 |
parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
|
11 |
-
parser.add_argument(
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
|
15 |
args = parser.parse_args()
|
16 |
|
|
|
8 |
|
9 |
def main():
|
10 |
parser = argparse.ArgumentParser(description="Generate Semantic Text-Codes for text files.")
|
11 |
+
parser.add_argument(
|
12 |
+
"path", type=str, help="Path to text files (supports glob patterns).", nargs="?"
|
13 |
+
)
|
14 |
+
parser.add_argument(
|
15 |
+
"-b", "--bits", type=int, default=256, help="Bit-Length of Code (default 256)"
|
16 |
+
)
|
17 |
+
parser.add_argument(
|
18 |
+
"-g", "--granular", action="store_true", help="Activate granular processing."
|
19 |
+
)
|
20 |
parser.add_argument("-d", "--debug", action="store_true", help="Show debugging messages.")
|
21 |
args = parser.parse_args()
|
22 |
|
iscc_sct/code_semantic_text.py
CHANGED
@@ -233,11 +233,15 @@ def model():
|
|
233 |
so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
|
234 |
try:
|
235 |
with sct.timer("ONNXMODEL load time"):
|
236 |
-
return rt.InferenceSession(
|
|
|
|
|
237 |
except NoSuchFile: # pragma: no cover
|
238 |
with sct.timer("ONNXMODEL aquisition/load time"):
|
239 |
model_path = sct.get_model()
|
240 |
-
return rt.InferenceSession(
|
|
|
|
|
241 |
|
242 |
|
243 |
def tokenize_chunks(chunks):
|
|
|
233 |
so.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL
|
234 |
try:
|
235 |
with sct.timer("ONNXMODEL load time"):
|
236 |
+
return rt.InferenceSession(
|
237 |
+
sct.MODEL_PATH, sess_options=so, providers=selected_onnx_providers
|
238 |
+
)
|
239 |
except NoSuchFile: # pragma: no cover
|
240 |
with sct.timer("ONNXMODEL aquisition/load time"):
|
241 |
model_path = sct.get_model()
|
242 |
+
return rt.InferenceSession(
|
243 |
+
model_path, sess_options=so, providers=selected_onnx_providers
|
244 |
+
)
|
245 |
|
246 |
|
247 |
def tokenize_chunks(chunks):
|
iscc_sct/demo.py
CHANGED
@@ -6,6 +6,7 @@ from loguru import logger as log
|
|
6 |
import gradio as gr
|
7 |
import iscc_sct as sct
|
8 |
import textwrap
|
|
|
9 |
|
10 |
|
11 |
newline_symbols = {
|
@@ -84,7 +85,9 @@ def generate_similarity_bar(similarity):
|
|
84 |
|
85 |
# Adjust the text position to be centered within the colored bar
|
86 |
text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
|
87 |
-
text_alignment =
|
|
|
|
|
88 |
|
89 |
bar_html = f"""
|
90 |
<h3>Semantic Similarity</h3>
|
@@ -97,66 +100,12 @@ def generate_similarity_bar(similarity):
|
|
97 |
return bar_html
|
98 |
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
" ".join(paragraph.split())
|
104 |
-
for paragraph in """
|
105 |
-
This document specifies the syntax and structure of the International Standard Content Code (ISCC),
|
106 |
-
as an identification system for digital assets (including encodings of text, images, audio, video or other content
|
107 |
-
across all media sectors). It also describes ISCC metadata and the use of ISCC in conjunction with other schemes, such
|
108 |
-
as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
|
109 |
-
|
110 |
-
An ISCC applies to a specific digital asset and is a data-descriptor deterministically constructed from multiple hash
|
111 |
-
digests using the algorithms and rules in this document. This document does not provide information on registration of
|
112 |
-
ISCCs.
|
113 |
-
""".strip().split("\n\n")
|
114 |
-
]
|
115 |
-
)
|
116 |
-
|
117 |
-
sample_text_de = "\n\n".join(
|
118 |
-
[
|
119 |
-
" ".join(paragraph.split())
|
120 |
-
for paragraph in """
|
121 |
-
Dieses Dokument spezifiziert die Syntax und Struktur des International Standard Content Code (ISCC) als
|
122 |
-
Identifizierungssystem für digitale Inhalte (einschließlich Kodierungen von Text, Bildern, Audio, Video oder anderen
|
123 |
-
Inhalten in allen Medienbereichen). Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung mit
|
124 |
-
anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
|
125 |
-
|
126 |
-
Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor, der deterministisch aus mehreren
|
127 |
-
Hash-Digests unter Verwendung der Algorithmen und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält
|
128 |
-
keine Informationen über die Registrierung von ISCCs.
|
129 |
-
""".strip().split("\n\n")
|
130 |
-
]
|
131 |
-
)
|
132 |
-
|
133 |
-
sample_text_bg = "\n\n".join(
|
134 |
-
[
|
135 |
-
" ".join(paragraph.split())
|
136 |
-
for paragraph in """
|
137 |
-
Този документ определя синтаксиса и структурата на Международния стандартен код на съдържанието (ISCC) като система за
|
138 |
-
идентификация на цифрови активи (включително кодиране на текст, изображения, аудио, видео или друго съдържание във
|
139 |
-
всички медийни сектори). Той описва също метаданните на ISCC и използването на ISCC във връзка с други схеми, като
|
140 |
-
DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
|
141 |
-
|
142 |
-
ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор на данни, конструиран от множество
|
143 |
-
хеш-разходи, като се използват алгоритмите и правилата в настоящия документ. Настоящият документ не предоставя
|
144 |
-
информация за регистрацията на ISCC.
|
145 |
-
""".strip().split("\n\n")
|
146 |
-
]
|
147 |
-
)
|
148 |
|
149 |
-
sample_text_zh = "\n\n".join(
|
150 |
-
[
|
151 |
-
" ".join(paragraph.split())
|
152 |
-
for paragraph in """
|
153 |
-
本文件规定了国际标准内容代码(ISCC)的语法和结构,作为数字资产(包括所有媒 体领域的文本、图像、音频、视频或其他内容的编码)的标识系统。它还介绍了
|
154 |
-
ISCC 元数据以及 ISCC 与其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的结合使用。
|
155 |
|
156 |
-
|
157 |
-
""".strip().split("\n\n")
|
158 |
-
]
|
159 |
-
)
|
160 |
|
161 |
custom_css = """
|
162 |
"""
|
@@ -176,67 +125,81 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
176 |
""",
|
177 |
)
|
178 |
with gr.Row(variant="panel"):
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
187 |
with gr.Row(variant="panel"):
|
188 |
with gr.Column(variant="panel"):
|
189 |
in_text_a = gr.TextArea(
|
190 |
label="Text A",
|
191 |
-
placeholder="Choose sample text from the dropdown or type or paste your text.",
|
192 |
lines=12,
|
193 |
max_lines=12,
|
194 |
)
|
195 |
-
sample_dropdown_a = gr.Dropdown(
|
196 |
-
choices=["None", "English", "Bulgarian"], label="Select sample for Text A", value="None"
|
197 |
-
)
|
198 |
out_code_a = gr.Textbox(label="ISCC Code for Text A")
|
199 |
-
out_chunks_a = gr.HighlightedText(
|
200 |
-
label="Chunked Text A",
|
201 |
-
interactive=False,
|
202 |
-
elem_id="chunked-text-a",
|
203 |
-
)
|
204 |
with gr.Column(variant="panel"):
|
205 |
in_text_b = gr.TextArea(
|
206 |
label="Text B",
|
207 |
-
placeholder="Choose sample text from the dropdown or type or paste your text.",
|
208 |
lines=12,
|
209 |
max_lines=12,
|
210 |
)
|
211 |
-
sample_dropdown_b = gr.Dropdown(
|
212 |
-
choices=["None", "German", "Chinese"], label="Select sample for Text B", value="None"
|
213 |
-
)
|
214 |
out_code_b = gr.Textbox(label="ISCC Code for Text B")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
out_chunks_b = gr.HighlightedText(
|
216 |
label="Chunked Text B",
|
217 |
interactive=False,
|
218 |
elem_id="chunked-text-b",
|
219 |
)
|
220 |
|
221 |
-
def update_sample_text(choice,
|
222 |
if choice == "None":
|
223 |
return ""
|
224 |
-
|
225 |
-
return sample_text_en if choice == "English" else sample_text_bg
|
226 |
-
else:
|
227 |
-
return sample_text_de if choice == "German" else sample_text_zh
|
228 |
|
229 |
sample_dropdown_a.change(
|
230 |
-
lambda choice: update_sample_text(choice, "
|
|
|
|
|
231 |
)
|
232 |
sample_dropdown_b.change(
|
233 |
-
lambda choice: update_sample_text(choice, "
|
|
|
|
|
234 |
)
|
235 |
|
236 |
-
with gr.Row(variant="panel"):
|
237 |
-
with gr.Column(variant="panel"):
|
238 |
-
out_similarity = gr.HTML(label="Similarity")
|
239 |
-
|
240 |
def process_text(text, nbits, suffix):
|
241 |
log.debug(f"{text[:20]}")
|
242 |
out_code_func = globals().get(f"out_code_{suffix}")
|
@@ -248,7 +211,9 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
248 |
out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
|
249 |
}
|
250 |
|
251 |
-
result = sct.gen_text_code_semantic(
|
|
|
|
|
252 |
iscc = sct.Metadata(**result).to_object_format()
|
253 |
|
254 |
# Generate chunked text with simprints and overlaps
|
@@ -318,14 +283,22 @@ with gr.Blocks(css=custom_css, theme=iscc_theme) as demo:
|
|
318 |
show_progress="full",
|
319 |
)
|
320 |
|
321 |
-
out_code_a.change(
|
322 |
-
|
|
|
|
|
|
|
|
|
323 |
|
324 |
def reset_all():
|
325 |
return (
|
326 |
gr.Slider(value=128), # Reset ISCC Bit-Length
|
327 |
-
gr.Dropdown(
|
328 |
-
|
|
|
|
|
|
|
|
|
329 |
gr.TextArea(value=""), # Reset Text A
|
330 |
gr.TextArea(value=""), # Reset Text B
|
331 |
gr.Textbox(value=""), # Reset ISCC Code for Text A
|
|
|
6 |
import gradio as gr
|
7 |
import iscc_sct as sct
|
8 |
import textwrap
|
9 |
+
import yaml
|
10 |
|
11 |
|
12 |
newline_symbols = {
|
|
|
85 |
|
86 |
# Adjust the text position to be centered within the colored bar
|
87 |
text_position = "left: 50%;" if similarity >= 0 else "right: 50%;"
|
88 |
+
text_alignment = (
|
89 |
+
"transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);"
|
90 |
+
)
|
91 |
|
92 |
bar_html = f"""
|
93 |
<h3>Semantic Similarity</h3>
|
|
|
100 |
return bar_html
|
101 |
|
102 |
|
103 |
+
def load_samples():
|
104 |
+
with open("iscc_sct/samples.yml", "r", encoding="utf-8") as file:
|
105 |
+
return yaml.safe_load(file)["samples"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
+
samples = load_samples()
|
|
|
|
|
|
|
109 |
|
110 |
custom_css = """
|
111 |
"""
|
|
|
125 |
""",
|
126 |
)
|
127 |
with gr.Row(variant="panel"):
|
128 |
+
with gr.Column(variant="panel"):
|
129 |
+
sample_dropdown_a = gr.Dropdown(
|
130 |
+
choices=["None"] + [lang for lang in samples["a"]],
|
131 |
+
label="Select sample for Text A",
|
132 |
+
value="None",
|
133 |
+
)
|
134 |
+
with gr.Column(variant="panel"):
|
135 |
+
sample_dropdown_b = gr.Dropdown(
|
136 |
+
choices=["None"] + [lang for lang in samples["b"]],
|
137 |
+
label="Select sample for Text B",
|
138 |
+
value="None",
|
139 |
+
)
|
140 |
+
|
141 |
with gr.Row(variant="panel"):
|
142 |
with gr.Column(variant="panel"):
|
143 |
in_text_a = gr.TextArea(
|
144 |
label="Text A",
|
145 |
+
placeholder="Choose sample text from the dropdown above or type or paste your text.",
|
146 |
lines=12,
|
147 |
max_lines=12,
|
148 |
)
|
|
|
|
|
|
|
149 |
out_code_a = gr.Textbox(label="ISCC Code for Text A")
|
|
|
|
|
|
|
|
|
|
|
150 |
with gr.Column(variant="panel"):
|
151 |
in_text_b = gr.TextArea(
|
152 |
label="Text B",
|
153 |
+
placeholder="Choose sample text from the dropdown above or type or paste your text.",
|
154 |
lines=12,
|
155 |
max_lines=12,
|
156 |
)
|
|
|
|
|
|
|
157 |
out_code_b = gr.Textbox(label="ISCC Code for Text B")
|
158 |
+
|
159 |
+
with gr.Row(variant="panel"):
|
160 |
+
with gr.Column(variant="panel"):
|
161 |
+
out_similarity = gr.HTML(label="Similarity")
|
162 |
+
|
163 |
+
with gr.Row(variant="panel"):
|
164 |
+
in_iscc_bits = gr.Slider(
|
165 |
+
label="ISCC Bit-Length",
|
166 |
+
info="NUMBER OF BITS FOR OUTPUT ISCC",
|
167 |
+
minimum=64,
|
168 |
+
maximum=256,
|
169 |
+
step=32,
|
170 |
+
value=64,
|
171 |
+
)
|
172 |
+
|
173 |
+
with gr.Row(variant="panel"):
|
174 |
+
with gr.Column(variant="panel"):
|
175 |
+
out_chunks_a = gr.HighlightedText(
|
176 |
+
label="Chunked Text A",
|
177 |
+
interactive=False,
|
178 |
+
elem_id="chunked-text-a",
|
179 |
+
)
|
180 |
+
with gr.Column(variant="panel"):
|
181 |
out_chunks_b = gr.HighlightedText(
|
182 |
label="Chunked Text B",
|
183 |
interactive=False,
|
184 |
elem_id="chunked-text-b",
|
185 |
)
|
186 |
|
187 |
+
def update_sample_text(choice, group):
|
188 |
if choice == "None":
|
189 |
return ""
|
190 |
+
return samples[group][choice]
|
|
|
|
|
|
|
191 |
|
192 |
sample_dropdown_a.change(
|
193 |
+
lambda choice: update_sample_text(choice, "a"),
|
194 |
+
inputs=[sample_dropdown_a],
|
195 |
+
outputs=[in_text_a],
|
196 |
)
|
197 |
sample_dropdown_b.change(
|
198 |
+
lambda choice: update_sample_text(choice, "b"),
|
199 |
+
inputs=[sample_dropdown_b],
|
200 |
+
outputs=[in_text_b],
|
201 |
)
|
202 |
|
|
|
|
|
|
|
|
|
203 |
def process_text(text, nbits, suffix):
|
204 |
log.debug(f"{text[:20]}")
|
205 |
out_code_func = globals().get(f"out_code_{suffix}")
|
|
|
211 |
out_chunks_func: gr.HighlightedText(value=None, elem_id="chunked-text"),
|
212 |
}
|
213 |
|
214 |
+
result = sct.gen_text_code_semantic(
|
215 |
+
text, bits=nbits, simprints=True, offsets=True, sizes=True, contents=True
|
216 |
+
)
|
217 |
iscc = sct.Metadata(**result).to_object_format()
|
218 |
|
219 |
# Generate chunked text with simprints and overlaps
|
|
|
283 |
show_progress="full",
|
284 |
)
|
285 |
|
286 |
+
out_code_a.change(
|
287 |
+
compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
|
288 |
+
)
|
289 |
+
out_code_b.change(
|
290 |
+
compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
|
291 |
+
)
|
292 |
|
293 |
def reset_all():
|
294 |
return (
|
295 |
gr.Slider(value=128), # Reset ISCC Bit-Length
|
296 |
+
gr.Dropdown(
|
297 |
+
value="None", choices=["None"] + [f"a:{lang}" for lang in samples["a"]]
|
298 |
+
), # Reset sample dropdown A
|
299 |
+
gr.Dropdown(
|
300 |
+
value="None", choices=["None"] + [f"b:{lang}" for lang in samples["b"]]
|
301 |
+
), # Reset sample dropdown B
|
302 |
gr.TextArea(value=""), # Reset Text A
|
303 |
gr.TextArea(value=""), # Reset Text B
|
304 |
gr.Textbox(value=""), # Reset ISCC Code for Text A
|
iscc_sct/dev.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import pathlib
|
|
|
2 |
|
3 |
|
4 |
HERE = pathlib.Path(__file__).parent.absolute()
|
@@ -20,3 +21,22 @@ def convert_lf(): # pragma: no cover
|
|
20 |
outfile.write(content)
|
21 |
n += 1
|
22 |
print(f"{n} files converted to LF")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pathlib
|
2 |
+
import yaml
|
3 |
|
4 |
|
5 |
HERE = pathlib.Path(__file__).parent.absolute()
|
|
|
21 |
outfile.write(content)
|
22 |
n += 1
|
23 |
print(f"{n} files converted to LF")
|
24 |
+
|
25 |
+
|
26 |
+
def format_yml():
|
27 |
+
for f in HERE.glob("**\*.yml"):
|
28 |
+
with open(f, "rt", encoding="utf-8") as infile:
|
29 |
+
data = yaml.safe_load(infile)
|
30 |
+
with open(f, "wt", encoding="utf-8", newline="\n") as outf:
|
31 |
+
yaml.safe_dump(
|
32 |
+
data,
|
33 |
+
outf,
|
34 |
+
indent=2,
|
35 |
+
width=80,
|
36 |
+
encoding="utf-8",
|
37 |
+
sort_keys=False,
|
38 |
+
default_flow_style=False,
|
39 |
+
default_style=">",
|
40 |
+
allow_unicode=True,
|
41 |
+
line_break="\n",
|
42 |
+
)
|
iscc_sct/models.py
CHANGED
@@ -82,7 +82,9 @@ class PrettyBaseModel(BaseModel):
|
|
82 |
return self.pretty_repr()
|
83 |
|
84 |
def pretty_repr(self):
|
85 |
-
return self.model_dump_json(
|
|
|
|
|
86 |
|
87 |
|
88 |
class Feature(PrettyBaseModel):
|
@@ -132,9 +134,15 @@ class Metadata(PrettyBaseModel):
|
|
132 |
new_features.append(new_feature_set)
|
133 |
else:
|
134 |
new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
|
135 |
-
new_feature_set.offsets = [
|
136 |
-
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
new_features.append(new_feature_set)
|
139 |
|
140 |
return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
|
@@ -154,7 +162,9 @@ class Metadata(PrettyBaseModel):
|
|
154 |
# Convert to object format if in index format
|
155 |
feature_set = self.to_object_format().features[0]
|
156 |
|
157 |
-
if not all(
|
|
|
|
|
158 |
return None
|
159 |
|
160 |
# Sort features by offset
|
@@ -191,7 +201,9 @@ class Metadata(PrettyBaseModel):
|
|
191 |
# Convert to object format if in index format
|
192 |
feature_set = self.to_object_format().features[0]
|
193 |
|
194 |
-
if not all(
|
|
|
|
|
195 |
return []
|
196 |
|
197 |
# Sort features by offset
|
|
|
82 |
return self.pretty_repr()
|
83 |
|
84 |
def pretty_repr(self):
|
85 |
+
return self.model_dump_json(
|
86 |
+
indent=2, exclude_unset=True, exclude_none=True, exclude_defaults=False
|
87 |
+
)
|
88 |
|
89 |
|
90 |
class Feature(PrettyBaseModel):
|
|
|
134 |
new_features.append(new_feature_set)
|
135 |
else:
|
136 |
new_feature_set.simprints = [f.simprint for f in feature_set.simprints]
|
137 |
+
new_feature_set.offsets = [
|
138 |
+
f.offset for f in feature_set.simprints if f.offset is not None
|
139 |
+
]
|
140 |
+
new_feature_set.sizes = [
|
141 |
+
f.size for f in feature_set.simprints if f.size is not None
|
142 |
+
]
|
143 |
+
new_feature_set.contents = [
|
144 |
+
f.content for f in feature_set.simprints if f.content is not None
|
145 |
+
]
|
146 |
new_features.append(new_feature_set)
|
147 |
|
148 |
return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)
|
|
|
162 |
# Convert to object format if in index format
|
163 |
feature_set = self.to_object_format().features[0]
|
164 |
|
165 |
+
if not all(
|
166 |
+
feature.content and feature.offset is not None for feature in feature_set.simprints
|
167 |
+
):
|
168 |
return None
|
169 |
|
170 |
# Sort features by offset
|
|
|
201 |
# Convert to object format if in index format
|
202 |
feature_set = self.to_object_format().features[0]
|
203 |
|
204 |
+
if not all(
|
205 |
+
feature.content and feature.offset is not None for feature in feature_set.simprints
|
206 |
+
):
|
207 |
return []
|
208 |
|
209 |
# Sort features by offset
|
iscc_sct/options.py
CHANGED
@@ -29,15 +29,27 @@ class SctOptions(BaseSettings):
|
|
29 |
multiple_of=32,
|
30 |
)
|
31 |
|
32 |
-
characters: bool = Field(
|
33 |
-
|
|
|
|
|
|
|
|
|
34 |
|
35 |
-
precision: int = Field(
|
|
|
|
|
36 |
|
37 |
-
simprints: bool = Field(
|
38 |
-
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
sizes: bool = Field(
|
|
|
|
|
41 |
|
42 |
contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")
|
43 |
|
@@ -52,7 +64,9 @@ class SctOptions(BaseSettings):
|
|
52 |
description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
|
53 |
)
|
54 |
|
55 |
-
trim: bool = Field(
|
|
|
|
|
56 |
|
57 |
model_config = SettingsConfigDict(
|
58 |
env_file=".env",
|
|
|
29 |
multiple_of=32,
|
30 |
)
|
31 |
|
32 |
+
characters: bool = Field(
|
33 |
+
True, description="ISCC_SCT_CHARACTERS - Include document character count"
|
34 |
+
)
|
35 |
+
embedding: bool = Field(
|
36 |
+
False, description="ISCC_SCT_EMBEDDING - Include global document embedding"
|
37 |
+
)
|
38 |
|
39 |
+
precision: int = Field(
|
40 |
+
8, description="ISCC_SCT_PRECISION - Max fractional digits for embeddings (default 8)"
|
41 |
+
)
|
42 |
|
43 |
+
simprints: bool = Field(
|
44 |
+
False, description="ISCC_SCT_SIMPRINTS - Include granular feature simprints"
|
45 |
+
)
|
46 |
+
offsets: bool = Field(
|
47 |
+
False, description="ISCC_SCT_OFFSETS - Include offsets of granular features"
|
48 |
+
)
|
49 |
|
50 |
+
sizes: bool = Field(
|
51 |
+
False, description="ISCC_SCT_SIZES - Include sizes of granular features (number of chars)"
|
52 |
+
)
|
53 |
|
54 |
contents: bool = Field(False, description="ISCC_SCT_CONTENTS - Include granular text chunks")
|
55 |
|
|
|
64 |
description="ISCC_SCT_OVERLAP - Max tokens allowed to overlap between chunks (Default 48)",
|
65 |
)
|
66 |
|
67 |
+
trim: bool = Field(
|
68 |
+
False, description="ISCC_SCT_TRIM - Trim whitespace from chunks (Default False)"
|
69 |
+
)
|
70 |
|
71 |
model_config = SettingsConfigDict(
|
72 |
env_file=".env",
|
iscc_sct/samples.yml
ADDED
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"samples":
|
2 |
+
"a":
|
3 |
+
"English": >
|
4 |
+
This document specifies the syntax and structure of the International Standard
|
5 |
+
Content Code (ISCC), as an identification system for digital assets (including
|
6 |
+
encodings of text, images, audio, video or other content across all media sectors).
|
7 |
+
It also describes ISCC metadata and the use of ISCC in conjunction with other
|
8 |
+
schemes, such as DOI, ISAN, ISBN, ISRC, ISSN and ISWC.
|
9 |
+
|
10 |
+
|
11 |
+
An ISCC applies to a specific digital asset and is a data-descriptor deterministically
|
12 |
+
constructed from multiple hash digests using the algorithms and rules in this
|
13 |
+
document. This document does not provide information on registration of ISCCs.
|
14 |
+
"Hungarian": >
|
15 |
+
Ez a dokumentum meghatározza a Nemzetközi Szabványos Tartalomkód (ISCC) szintaxisát
|
16 |
+
és szerkezetét, amely a digitális eszközök azonosítási rendszere (beleértve
|
17 |
+
a szövegek, képek, hang-, videó- vagy egyéb tartalmak kódolását minden médiaszektorban).
|
18 |
+
Leírja továbbá az ISCC metaadatokat és az ISCC használatát más sémákkal, mint
|
19 |
+
például a DOI, ISAN, ISBN, ISRC, ISSN és ISWC.
|
20 |
+
|
21 |
+
|
22 |
+
Az ISCC egy adott digitális eszközre vonatkozik, és több hash kivonatból determinisztikusan
|
23 |
+
összeállított adatleíró a jelen dokumentumban szereplő algoritmusok és szabályok
|
24 |
+
alapján. Ez a dokumentum nem ad tájékoztatást az ISCC-k regisztrációjáról.
|
25 |
+
"Bulgarian": >
|
26 |
+
Този документ определя синтаксиса и структурата на Международния стандартен
|
27 |
+
код на съдържанието (ISCC) като система за идентификация на цифрови активи (включително
|
28 |
+
кодиране на текст, изображения, аудио, видео или друго съдържание във всички
|
29 |
+
медийни сектори). Той описва също метаданните на ISCC и използването на ISCC
|
30 |
+
във връзка с други схеми, като DOI, ISAN, ISBN, ISRC, ISSN и ISWC.
|
31 |
+
|
32 |
+
|
33 |
+
ISCC се прилага за конкретен цифров актив и представлява детерминиран дескриптор
|
34 |
+
на данни, конструиран от множество хеш-разходи, като се използват алгоритмите
|
35 |
+
и правилата в настоящия документ. Настоящият документ не предоставя информация
|
36 |
+
за регистрацията на ISCC.
|
37 |
+
"Finnish": >
|
38 |
+
Tässä asiakirjassa määritellään ISCC:n (International Standard Content Code)
|
39 |
+
syntaksi ja rakenne digitaalisen omaisuuden tunnistusjärjestelmänä (mukaan lukien
|
40 |
+
tekstin, kuvien, äänen, videon tai muun sisällön koodaukset kaikilla mediasektoreilla).
|
41 |
+
Siinä kuvataan myös ISCC-metatiedot ja ISCC:n käyttö muiden järjestelmien, kuten
|
42 |
+
DOI, ISAN, ISBN, ISRC, ISSN ja ISWC, kanssa.
|
43 |
+
|
44 |
+
|
45 |
+
ISCC koskee tiettyä digitaalista omaisuutta, ja se on datakuvaaja, joka on deterministisesti
|
46 |
+
muodostettu useista hajautuskoosteista käyttämällä tämän asiakirjan algoritmeja
|
47 |
+
ja sääntöjä. Tämä asiakirja ei sisällä tietoja ISCC:iden rekisteröinnistä.
|
48 |
+
"Arabic": >
|
49 |
+
تحدد هذه الوثيقة بناء جملة وبنية كود المحتوى القياسي الدولي (ISCC)، كنظام تعريف
|
50 |
+
للأصول الرقمية (بما في ذلك ترميز النصوص أو الصور أو الصوت أو الفيديو أو أي محتوى
|
51 |
+
آخر عبر جميع قطاعات الوسائط). ويصف أيضًا بيانات تعريف ISCC واستخدام ISCC بالتزامن
|
52 |
+
مع مخططات أخرى، مثل DOI وISAN وISBN وISRC وISSN وISWC.
|
53 |
+
|
54 |
+
|
55 |
+
ينطبق ISCC على أصل رقمي محدد وهو عبارة عن واصف بيانات تم إنشاؤه بشكل حتمي من
|
56 |
+
ملخصات تجزئة متعددة باستخدام الخوارزميات والقواعد الواردة في هذه الوثيقة. لا
|
57 |
+
تقدم هذه الوثيقة معلومات عن تسجيل ISCCs.
|
58 |
+
"b":
|
59 |
+
"Paraphrased": >
|
60 |
+
In order to identify digital assets (such as encodings of text, images, music,
|
61 |
+
video, and other content across all media sectors), this paper outlines the
|
62 |
+
syntax and structure of the International Standard Content Code (ISCC). Additionally,
|
63 |
+
it explains how to use ISCC metadata and how to combine it with other schemes
|
64 |
+
like DOI, ISAN, ISBN, ISRC, ISSN, and ISWC.
|
65 |
+
|
66 |
+
|
67 |
+
An ISCC is a data-descriptor that is applied to a particular digital asset and
|
68 |
+
is deterministically built from several hash digests utilizing the rules and
|
69 |
+
techniques in this specification. There is no information about ISCC registration
|
70 |
+
in this paper.
|
71 |
+
"German": >
|
72 |
+
Dieses Dokument spezifiziert die Syntax und Struktur des International Standard
|
73 |
+
Content Code (ISCC) als Identifizierungssystem für digitale Inhalte (einschließlich
|
74 |
+
Kodierungen von Text, Bildern, Audio, Video oder anderen Inhalten in allen Medienbereichen).
|
75 |
+
Sie beschreibt auch ISCC-Metadaten und die Verwendung von ISCC in Verbindung
|
76 |
+
mit anderen Systemen wie DOI, ISAN, ISBN, ISRC, ISSN und ISWC.
|
77 |
+
|
78 |
+
|
79 |
+
Ein ISCC bezieht sich auf ein bestimmtes digitales Gut und ist ein Daten-Deskriptor,
|
80 |
+
der deterministisch aus mehreren Hash-Digests unter Verwendung der Algorithmen
|
81 |
+
und Regeln in diesem Dokument erstellt wird. Dieses Dokument enthält keine Informationen
|
82 |
+
über die Registrierung von ISCCs.
|
83 |
+
"Chinese": >
|
84 |
+
本文件規定了國際標準內容編碼 (ISCC) 的語法和結構,作為數位資產 (包括所有媒體領域的文字、影像、音訊、視訊或其他內容的編碼) 的識別系統。它還介紹了
|
85 |
+
ISCC 元資料以及 ISCC 與其他方案(如 DOI、ISAN、ISBN、ISRC、ISSN 和 ISWC)的結合使用。
|
86 |
+
|
87 |
+
|
88 |
+
ISCC 適用於特定的數位資產,是使用本文件中的演算法和規則,由多個雜湊摘要(hash digests)確定地建構出來的資料描述符(data-descriptor)。本文件不提供
|
89 |
+
ISCC 的註冊資訊。
|
90 |
+
"Korean": >
|
91 |
+
이 문서는 디지털 자산(모든 미디어 부문의 텍스트, 이미지, 오디오, 비디오 또는 기타 콘텐츠의 인코딩 포함)에 대한 식별 시스템인 ISCC(국제
|
92 |
+
표준 콘텐츠 코드)의 구문과 구조를 지정합니다. 또한 ISCC 메타데이터와 DOI, ISAN, ISBN, ISRC, ISSN 및 ISWC와
|
93 |
+
같은 다른 체계와 함께 ISCC를 사용하는 방법에 대해 설명합니다.
|
94 |
+
|
95 |
+
|
96 |
+
ISCC는 특정 디지털 자산에 적용되며 이 문서의 알고리즘과 규칙을 사용하여 여러 해시 다이제스트에서 결정론적으로 구성된 데이터 설명자입니다.
|
97 |
+
이 문서는 ISCC 등록에 대한 정보를 제공하지 않습니다.
|
98 |
+
"Japanese": >
|
99 |
+
この文書は、デジタル資産 (すべてのメディア セクターにわたるテキスト、画像、オーディオ、ビデオ、またはその他のコンテンツのエンコードを含む) の識別システムとして、国際標準コンテンツ
|
100 |
+
コード (ISCC) の構文と構造を指定します。また、ISCC メタデータと、DOI、ISAN、ISBN、ISRC、ISSN、ISWC などの他のスキームと組み合わせた
|
101 |
+
ISCC の使用についても説明します。
|
102 |
+
|
103 |
+
|
104 |
+
ISCC は特定のデジタル資産に適用され、本書のアルゴリズムとルールを使用して複数のハッシュ ダイジェストから決定論的に構築されるデータ記述子です。この文書には、ISCC
|
105 |
+
の登録に関する情報は記載されていません。
|
poetry.lock
CHANGED
@@ -2796,4 +2796,4 @@ gpu = ["onnxruntime-gpu"]
|
|
2796 |
[metadata]
|
2797 |
lock-version = "2.0"
|
2798 |
python-versions = ">=3.9,<3.13"
|
2799 |
-
content-hash = "
|
|
|
2796 |
[metadata]
|
2797 |
lock-version = "2.0"
|
2798 |
python-versions = ">=3.9,<3.13"
|
2799 |
+
content-hash = "e4a4f012af4c1e60326f792c8801857dbf9298d8992fdd83d3b8f0688d4c04ea"
|
pyproject.toml
CHANGED
@@ -60,6 +60,7 @@ numpy = "<2.0.0"
|
|
60 |
pybase64 = "^1.4.0"
|
61 |
certifi = ">=2024.07.04"
|
62 |
gradio = { version = "*", optional = true }
|
|
|
63 |
|
64 |
|
65 |
[tool.poetry.extras]
|
@@ -79,7 +80,7 @@ mdformat-gfm-alerts = "*"
|
|
79 |
mdformat-frontmatter = "*"
|
80 |
|
81 |
[tool.ruff]
|
82 |
-
line-length =
|
83 |
|
84 |
[tool.ruff.format]
|
85 |
line-ending = "lf"
|
@@ -89,11 +90,12 @@ omit = ["iscc_sct/dev.py", "tests/", "iscc_sct/demo.py"]
|
|
89 |
|
90 |
[tool.poe.tasks]
|
91 |
format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
|
92 |
-
format-markdown = { cmd = "mdformat --wrap
|
|
|
93 |
convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
|
94 |
test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
|
95 |
update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
|
96 |
-
all = ["format-code", "format-markdown", "convert-lf", "test"]
|
97 |
update = ["update-dependencies", "all"]
|
98 |
|
99 |
[build-system]
|
|
|
60 |
pybase64 = "^1.4.0"
|
61 |
certifi = ">=2024.07.04"
|
62 |
gradio = { version = "*", optional = true }
|
63 |
+
pyyaml = "^6.0.2"
|
64 |
|
65 |
|
66 |
[tool.poetry.extras]
|
|
|
80 |
mdformat-frontmatter = "*"
|
81 |
|
82 |
[tool.ruff]
|
83 |
+
line-length = 100
|
84 |
|
85 |
[tool.ruff.format]
|
86 |
line-ending = "lf"
|
|
|
90 |
|
91 |
[tool.poe.tasks]
|
92 |
format-code = { cmd = "ruff format", help = "Code style formating with ruff" }
|
93 |
+
format-markdown = { cmd = "mdformat --wrap 100 --end-of-line lf README.md", help = "Markdown formating with mdformat" }
|
94 |
+
format-yml = { script = "iscc_sct.dev:format_yml", help = "Format YML files"}
|
95 |
convert-lf = { script = "iscc_sct.dev:convert_lf", help = "Convert line endings to LF"}
|
96 |
test = { cmd = "pytest --cov=iscc_sct --cov-fail-under=100", help = "Run tests with coverage" }
|
97 |
update-dependencies = { cmd = "poetry update", help = "Update dependencies" }
|
98 |
+
all = ["format-code", "format-markdown", "format-yml", "convert-lf", "test"]
|
99 |
update = ["update-dependencies", "all"]
|
100 |
|
101 |
[build-system]
|
tests/benchmark.py
CHANGED
@@ -32,7 +32,9 @@ def benchmark(folder):
|
|
32 |
elapsed_time = end_time - start_time
|
33 |
total_time += elapsed_time
|
34 |
file_count += 1
|
35 |
-
log.info(
|
|
|
|
|
36 |
|
37 |
if file_count > 0:
|
38 |
avg_time = total_time / file_count
|
@@ -45,7 +47,9 @@ def benchmark(folder):
|
|
45 |
|
46 |
def main():
|
47 |
parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
|
48 |
-
parser.add_argument(
|
|
|
|
|
49 |
args = parser.parse_args()
|
50 |
|
51 |
benchmark(args.folder)
|
|
|
32 |
elapsed_time = end_time - start_time
|
33 |
total_time += elapsed_time
|
34 |
file_count += 1
|
35 |
+
log.info(
|
36 |
+
f"Processed {txt_path.name} in {elapsed_time:.2f} seconds. ISCC: {iscc_meta['iscc']}"
|
37 |
+
)
|
38 |
|
39 |
if file_count > 0:
|
40 |
avg_time = total_time / file_count
|
|
|
47 |
|
48 |
def main():
|
49 |
parser = argparse.ArgumentParser(description="Benchmark ISCC Semantic-Code Text generation.")
|
50 |
+
parser.add_argument(
|
51 |
+
"folder", type=str, help="Directory containing text files for benchmarking."
|
52 |
+
)
|
53 |
args = parser.parse_args()
|
54 |
|
55 |
benchmark(args.folder)
|
tests/test_cli.py
CHANGED
@@ -52,7 +52,9 @@ def test_cli_generate_sct(sample_text_file):
|
|
52 |
|
53 |
|
54 |
def test_cli_generate_sct_granular(sample_text_file):
|
55 |
-
result = subprocess.run(
|
|
|
|
|
56 |
assert result.returncode == 0
|
57 |
assert "features" in result.stdout
|
58 |
|
|
|
52 |
|
53 |
|
54 |
def test_cli_generate_sct_granular(sample_text_file):
|
55 |
+
result = subprocess.run(
|
56 |
+
[sct, str(sample_text_file), "--granular"], capture_output=True, text=True
|
57 |
+
)
|
58 |
assert result.returncode == 0
|
59 |
assert "features" in result.stdout
|
60 |
|
tests/test_iscc_sct.py
CHANGED
@@ -178,7 +178,9 @@ def test_embed_tokens():
|
|
178 |
chunks = ["Hello World", "These are chunks"]
|
179 |
tokens = tokenize_chunks(chunks)
|
180 |
embeddings = embed_tokens(tokens)
|
181 |
-
assert list(embeddings[0][0][:3]) == pytest.approx(
|
|
|
|
|
182 |
|
183 |
|
184 |
def test_embed_chunks():
|
|
|
178 |
chunks = ["Hello World", "These are chunks"]
|
179 |
tokens = tokenize_chunks(chunks)
|
180 |
embeddings = embed_tokens(tokens)
|
181 |
+
assert list(embeddings[0][0][:3]) == pytest.approx(
|
182 |
+
[0.05907335, 0.11408358, 0.12727071], rel=1e-2
|
183 |
+
)
|
184 |
|
185 |
|
186 |
def test_embed_chunks():
|
tests/test_main.py
CHANGED
@@ -21,7 +21,9 @@ def test_create_granular():
|
|
21 |
"maintype": "semantic",
|
22 |
"subtype": "text",
|
23 |
"version": 0,
|
24 |
-
"simprints": [
|
|
|
|
|
25 |
}
|
26 |
],
|
27 |
}
|
|
|
21 |
"maintype": "semantic",
|
22 |
"subtype": "text",
|
23 |
"version": 0,
|
24 |
+
"simprints": [
|
25 |
+
{"content": "Hello World", "offset": 0, "simprint": "82eJ2NG741E", "size": 11}
|
26 |
+
],
|
27 |
}
|
28 |
],
|
29 |
}
|
tests/test_models.py
CHANGED
@@ -22,7 +22,11 @@ def test_feature_initialization():
|
|
22 |
|
23 |
def test_feature_set_initialization():
|
24 |
fs = FeatureSet()
|
25 |
-
assert fs.model_dump(exclude_none=True) == {
|
|
|
|
|
|
|
|
|
26 |
|
27 |
|
28 |
def test_sct_meta_initialization():
|
@@ -33,7 +37,12 @@ def test_sct_meta_initialization():
|
|
33 |
assert meta.features is None
|
34 |
|
35 |
# Test initialization with all fields
|
36 |
-
features = [
|
|
|
|
|
|
|
|
|
|
|
37 |
meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
|
38 |
assert meta.iscc == "ISCC1234567890"
|
39 |
assert meta.characters == 1000
|
@@ -67,7 +76,12 @@ def test_metadata_to_index_format():
|
|
67 |
def test_metadata_to_object_format():
|
68 |
# Test conversion from Index-Format to Object-Format
|
69 |
features = [
|
70 |
-
FeatureSet(
|
|
|
|
|
|
|
|
|
|
|
71 |
]
|
72 |
meta = Metadata(iscc="ISCC1234567890", features=features)
|
73 |
object_meta = meta.to_object_format()
|
|
|
22 |
|
23 |
def test_feature_set_initialization():
|
24 |
fs = FeatureSet()
|
25 |
+
assert fs.model_dump(exclude_none=True) == {
|
26 |
+
"maintype": "semantic",
|
27 |
+
"subtype": "text",
|
28 |
+
"version": 0,
|
29 |
+
}
|
30 |
|
31 |
|
32 |
def test_sct_meta_initialization():
|
|
|
37 |
assert meta.features is None
|
38 |
|
39 |
# Test initialization with all fields
|
40 |
+
features = [
|
41 |
+
FeatureSet(
|
42 |
+
simprints=[Feature(simprint="feature1", offset=0, content="text1")],
|
43 |
+
embedding=[0.1, 0.2],
|
44 |
+
)
|
45 |
+
]
|
46 |
meta = Metadata(iscc="ISCC1234567890", characters=1000, features=features)
|
47 |
assert meta.iscc == "ISCC1234567890"
|
48 |
assert meta.characters == 1000
|
|
|
76 |
def test_metadata_to_object_format():
|
77 |
# Test conversion from Index-Format to Object-Format
|
78 |
features = [
|
79 |
+
FeatureSet(
|
80 |
+
simprints=["feature1", "feature2"],
|
81 |
+
offsets=[0, 5],
|
82 |
+
sizes=[5, 5],
|
83 |
+
contents=["text1", "text2"],
|
84 |
+
)
|
85 |
]
|
86 |
meta = Metadata(iscc="ISCC1234567890", features=features)
|
87 |
object_meta = meta.to_object_format()
|
tests/visualize.py
CHANGED
@@ -49,7 +49,9 @@ def generate_html(fingerprint_data):
|
|
49 |
if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
|
50 |
overlap_end = chunks[i + 1]["offset"]
|
51 |
html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
|
52 |
-
html_content += escape_and_preserve_breaks(
|
|
|
|
|
53 |
else:
|
54 |
html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
|
55 |
|
|
|
49 |
if i < len(chunks) - 1 and end > chunks[i + 1]["offset"]:
|
50 |
overlap_end = chunks[i + 1]["offset"]
|
51 |
html_content += f'<span class="{chunk_color}">{escape_and_preserve_breaks(chunk["text"][start - chunk["offset"]:overlap_end - chunk["offset"]])}</span>'
|
52 |
+
html_content += escape_and_preserve_breaks(
|
53 |
+
chunk["text"][overlap_end - chunk["offset"] :]
|
54 |
+
)
|
55 |
else:
|
56 |
html_content += escape_and_preserve_breaks(chunk["text"][start - chunk["offset"] :])
|
57 |
|