""" Gradio demo showcasing ISCC Semantic Text Code. """ from loguru import logger as log import gradio as gr import iscc_sct as sct import textwrap import yaml import pathlib HERE = pathlib.Path(__file__).parent.absolute() custom_css = """ .simbar { background: white; min-height: 30px; } """ newline_symbols = { "\u000a": "⏎", # Line Feed - Represented by the 'Return' symbol "\u000b": "↨", # Vertical Tab - Represented by the 'Up Down Arrow' symbol "\u000c": "␌", # Form Feed - Unicode Control Pictures representation "\u000d": "↵", # Carriage Return - 'Downwards Arrow with Corner Leftwards' symbol "\u0085": "⤓", # Next Line - 'Downwards Arrow with Double Stroke' symbol "\u2028": "↲", # Line Separator - 'Downwards Arrow with Tip Leftwards' symbol "\u2029": "¶", # Paragraph Separator - Represented by the 'Pilcrow' symbol } def no_nl(text): """Replace non-printable newline characters with printable symbols""" for char, symbol in newline_symbols.items(): text = text.replace(char, symbol) return text def no_nl_inner(text): """Replace non-printable newline characters with printable symbols, ignoring leading and trailing newlines""" # Strip leading and trailing whitespace stripped_text = text.strip() # Replace newline characters within the text for char, symbol in newline_symbols.items(): stripped_text = stripped_text.replace(char, symbol) # Add back the leading and trailing newlines leading_newlines = len(text) - len(text.lstrip()) trailing_newlines = len(text) - len(text.rstrip()) return "\n" * leading_newlines + stripped_text + "\n" * trailing_newlines def clean_chunk(chunk): """Strip consecutive line breaks in text to a maximum of 2.""" return chunk.replace("\n\n", "\n") def compute_iscc_code(text1, text2, bit_length): code1 = sct.gen_text_code_semantic(text1, bits=bit_length) code2 = sct.gen_text_code_semantic(text2, bits=bit_length) similarity = compare_codes(code1["iscc"], code2["iscc"], bit_length) return code1["iscc"], code2["iscc"], similarity import binascii def compare_codes(code_a, code_b, bits): if code_a and code_b: code_a_str = code_a.value if hasattr(code_a, "value") else str(code_a) code_b_str = code_b.value if hasattr(code_b, "value") else str(code_b) if code_a_str and code_b_str: try: distance = sct.iscc_distance(code_a_str, code_b_str) return generate_similarity_bar(hamming_to_cosine(distance, bits)) except binascii.Error: # Invalid ISCC code format return None return None def truncate_text(text, max_length=70): return textwrap.shorten(text, width=max_length, placeholder="...") def hamming_to_cosine(hamming_distance: int, dim: int) -> float: """Aproximate the cosine similarity for a given hamming distance and dimension""" result = 1 - (2 * hamming_distance) / dim return result def generate_similarity_bar(similarity): """Generate a horizontal bar representing the similarity value, scaled to -100% to +100%.""" # Scale similarity from [-1, 1] to [-100, 100] display_similarity = similarity * 100 # Calculate the width of the bar based on the absolute value of similarity bar_width = int(abs(similarity) * 50) # 50% is half the width of the container # Determine the color and starting position based on the sign of the similarity color = "green" if similarity >= 0 else "red" position = "left" if similarity >= 0 else "right" # Adjust the text position to be centered within the colored bar text_position = "left: 50%;" if similarity >= 0 else "right: 50%;" text_alignment = ( "transform: translateX(-50%);" if similarity >= 0 else "transform: translateX(50%);" ) tooltip = "Similarity based on 
ISCC code comparison, not direct text comparison." bar_html = f"""
{display_similarity:.2f}%
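# Worked example for the approximation above: comparing two 64-bit codes that differ in
# 8 bit positions gives 1 - (2 * 8) / 64 = 0.75, which the similarity bar renders as 75.00%;
# a Hamming distance of 48 gives 1 - (2 * 48) / 64 = -0.5, rendered as -50.00%.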
""" return bar_html def load_samples(): with open(HERE / "samples.yml", "r", encoding="utf-8") as file: return yaml.safe_load(file)["samples"] samples = load_samples() iscc_theme = gr.themes.Default( font=[gr.themes.GoogleFont("Readex Pro Light")], font_mono=[gr.themes.GoogleFont("JetBrains Mono")], text_size=gr.themes.sizes.text_lg, radius_size=gr.themes.sizes.radius_none, ) with gr.Blocks(css=custom_css, theme=iscc_theme) as demo: with gr.Row(variant="panel"): gr.Markdown( """ ## 🔮️ ISCC - Semantic-Code Text Demo of cross-lingual Semantic Text-Code (proof of concept) """, ) with gr.Row(variant="panel"): with gr.Column(variant="panel"): sample_dropdown_a = gr.Dropdown( choices=["None"] + [lang for lang in samples["a"]], label="Select sample for Text A", value="None", ) with gr.Column(variant="panel"): sample_dropdown_b = gr.Dropdown( choices=["None"] + [lang for lang in samples["b"]], label="Select sample for Text B", value="None", ) with gr.Row(variant="panel"): with gr.Column(variant="panel"): in_text_a = gr.TextArea( label="Text A", placeholder="Choose sample text from the dropdown above or type or paste your text.", lines=12, max_lines=12, ) out_code_a = gr.Textbox(label="ISCC-SCT for Text A") with gr.Column(variant="panel"): in_text_b = gr.TextArea( label="Text B", placeholder="Choose sample text from the dropdown above or type or paste your text.", lines=12, max_lines=12, ) out_code_b = gr.Textbox(label="ISCC-SCT for Text B") with gr.Row(variant="panel"): with gr.Column(variant="panel"): out_similarity_title = gr.Markdown("### ISCC-based Semantic Similarity") with gr.Row(elem_classes="simbar"): out_similarity = gr.HTML() gr.Markdown( "**NOTE:** Similarity is calculated based on the generated ISCC-SCT, not the original text." ) with gr.Row(variant="panel"): reset_button = gr.Button("Reset All") with gr.Accordion(label="🔍 Explore Details & Advanced Options", open=True): with gr.Row(variant="panel"): with gr.Column(variant="panel"): in_iscc_bits = gr.Slider( label="ISCC Bit-Length", info="NUMBER OF BITS FOR OUTPUT ISCC", minimum=64, maximum=256, step=32, value=sct.sct_opts.bits, ) with gr.Column(variant="panel"): in_max_tokens = gr.Slider( label="Max Tokens", info="MAXIMUM NUMBER OF TOKENS PER CHUNK", minimum=49, maximum=sct.sct_opts.max_tokens, step=1, value=127, ) with gr.Row(variant="panel"): with gr.Column(variant="panel"): out_chunks_a = gr.HighlightedText( label="Chunked Text A", interactive=False, elem_id="chunked-text-a", ) with gr.Column(variant="panel"): out_chunks_b = gr.HighlightedText( label="Chunked Text B", interactive=False, elem_id="chunked-text-b", ) with gr.Row(variant="panel"): with gr.Column(variant="panel"): gr.Markdown("### Granular Matches") in_granular_matches = gr.Dataframe( headers=["Chunk A", "Similarity", "Chunk B"], column_widths=["45%", "10%", "45%"], wrap=True, elem_classes="granular-matches", ) def update_sample_text(choice, group): if choice == "None": return "" return samples[group][choice] sample_dropdown_a.change( lambda choice: update_sample_text(choice, "a"), inputs=[sample_dropdown_a], outputs=[in_text_a], ) sample_dropdown_b.change( lambda choice: update_sample_text(choice, "b"), inputs=[sample_dropdown_b], outputs=[in_text_b], ) def process_and_calculate(text_a, text_b, nbits, max_tokens): log.debug(f"Processing text_a: {text_a[:20]}, text_b: {text_b[:20]}") def process_single_text(text, suffix): out_code_func = globals().get(f"out_code_{suffix}") out_chunks_func = globals().get(f"out_chunks_{suffix}") if not text: return { out_code_func: 
    def process_and_calculate(text_a, text_b, nbits, max_tokens):
        log.debug(f"Processing text_a: {text_a[:20]}, text_b: {text_b[:20]}")

        def process_single_text(text, suffix):
            out_code_func = globals().get(f"out_code_{suffix}")
            out_chunks_func = globals().get(f"out_chunks_{suffix}")

            if not text:
                return {
                    out_code_func: gr.Textbox(value=None),
                    out_chunks_func: gr.HighlightedText(
                        value=None, elem_id=f"chunked-text-{suffix}"
                    ),
                }

            result = sct.gen_text_code_semantic(
                text,
                bits=nbits,
                simprints=True,
                offsets=True,
                sizes=True,
                contents=True,
                max_tokens=max_tokens,
            )
            iscc = sct.Metadata(**result).to_object_format()

            # Generate chunked text with simprints and overlaps
            features = iscc.features[0]
            highlighted_chunks = []
            overlaps = iscc.get_overlaps()

            for i, feature in enumerate(features.simprints):
                feature: sct.Feature
                content = feature.content

                # Remove leading overlap
                if i > 0 and overlaps[i - 1]:
                    content = content[len(overlaps[i - 1]) :]

                # Remove trailing overlap
                if i < len(overlaps) and overlaps[i]:
                    content = content[: -len(overlaps[i])]

                label = f"{feature.size}:{feature.simprint}"
                highlighted_chunks.append((no_nl_inner(content), label))

                if i < len(overlaps):
                    overlap = overlaps[i]
                    if overlap:
                        highlighted_chunks.append((f"\n{no_nl(overlap)}\n", "overlap"))

            return {
                out_code_func: gr.Textbox(value=iscc.iscc),
                out_chunks_func: gr.HighlightedText(
                    value=highlighted_chunks, elem_id=f"chunked-text-{suffix}"
                ),
                "metadata": iscc,
            }

        result_a = process_single_text(text_a, "a")
        result_b = process_single_text(text_b, "b")

        code_a = result_a[out_code_a] if text_a else None
        code_b = result_b[out_code_b] if text_b else None
        similarity = compare_codes(code_a, code_b, nbits) or out_similarity

        granular_matches = []
        if text_a and text_b:
            matches = sct.granular_similarity(
                result_a["metadata"], result_b["metadata"], threshold=80
            )
            for match in matches:
                granular_matches.append(
                    [
                        match[0].content,
                        f"{match[1]}%",
                        match[2].content,
                    ]
                )

        return (
            result_a[out_code_a],
            result_a[out_chunks_a],
            result_b[out_code_b],
            result_b[out_chunks_b],
            similarity,
            gr.Dataframe(value=granular_matches),
        )

    in_text_a.change(
        process_and_calculate,
        inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
        outputs=[
            out_code_a,
            out_chunks_a,
            out_code_b,
            out_chunks_b,
            out_similarity,
            in_granular_matches,
        ],
        show_progress="full",
        trigger_mode="always_last",
    )

    in_text_b.change(
        process_and_calculate,
        inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
        outputs=[
            out_code_a,
            out_chunks_a,
            out_code_b,
            out_chunks_b,
            out_similarity,
            in_granular_matches,
        ],
        show_progress="full",
        trigger_mode="always_last",
    )

    in_iscc_bits.change(
        process_and_calculate,
        inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
        outputs=[
            out_code_a,
            out_chunks_a,
            out_code_b,
            out_chunks_b,
            out_similarity,
            in_granular_matches,
        ],
        show_progress="full",
    )

    in_max_tokens.change(
        process_and_calculate,
        inputs=[in_text_a, in_text_b, in_iscc_bits, in_max_tokens],
        outputs=[
            out_code_a,
            out_chunks_a,
            out_code_b,
            out_chunks_b,
            out_similarity,
            in_granular_matches,
        ],
        show_progress="full",
    )

    out_code_a.change(
        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
    )
    out_code_b.change(
        compare_codes, inputs=[out_code_a, out_code_b, in_iscc_bits], outputs=[out_similarity]
    )

    def reset_all():
        return (
            gr.Slider(value=64),  # Reset ISCC Bit-Length
            gr.Dropdown(
                value="None", choices=["None"] + [lang for lang in samples["a"]]
            ),  # Reset sample dropdown A
            gr.Dropdown(
                value="None", choices=["None"] + [lang for lang in samples["b"]]
            ),  # Reset sample dropdown B
            gr.TextArea(value=""),  # Reset Text A
            gr.TextArea(value=""),  # Reset Text B
            gr.Textbox(value=""),  # Reset ISCC Code for Text A
            gr.Textbox(value=""),  # Reset ISCC Code for Text B
            gr.HTML(value=""),  # Reset Similarity
            gr.HighlightedText(value=[]),  # Reset Chunked Text A
            gr.HighlightedText(value=[]),  # Reset Chunked Text B
        )

    reset_button.click(
        reset_all,
        outputs=[
            in_iscc_bits,
            sample_dropdown_a,
            sample_dropdown_b,
            in_text_a,
            in_text_b,
            out_code_a,
            out_code_b,
            out_similarity,
            out_chunks_a,
            out_chunks_b,
        ],
    )

    with gr.Row(variant="panel"):
        gr.Markdown(
            """
## Understanding ISCC Semantic Text-Codes

### What is an ISCC Semantic Text-Code?
An ISCC Semantic Text-Code is a digital fingerprint for text content. It captures the meaning of
the text, not just the exact words. Technically, it is an ISCC-encoded, binarized, multi-lingual
document embedding.

### How does it work?
1. **Input**: You provide a text in any language.
2. **Processing**: Vector embeddings are created for individual chunks of the text.
3. **Output**: A unique ISCC-UNIT is generated that represents the entire text's content.

The sketch below shows what this looks like when calling the library directly.
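It is a minimal example of the same `gen_text_code_semantic` call this demo uses; the sample text
and the 64-bit length are arbitrary:

```python
import iscc_sct as sct

result = sct.gen_text_code_semantic("A text in any language.", bits=64)
print(result["iscc"])  # the generated ISCC-UNIT, e.g. "ISCC:CAA..."
```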

### What can it do?
- **Cross-language matching**: It can recognize similar content across different languages.
- **Similarity detection**: It can measure how similar two texts are in meaning, not just in words.
- **Content identification**: It can help identify texts with similar content, even if the wording is different.

### How to use this demo:
1. **Enter text**: Type or paste text into either or both text boxes.
2. **Adjust bit length**: Use the slider to change the detail level of the code (higher = more detailed).
3. **View results**: See the generated ISCC code for each text.
4. **Compare**: Look at the similarity bar to see how alike the two texts are in meaning, based on their ISCC codes.

### Important Note:
The similarity shown is calculated by comparing the ISCC codes, not the original texts. This allows
for efficient and privacy-preserving comparisons, as only the codes need to be shared or stored.
"""
        )
        gr.Markdown(
            """
### Why is this useful?
- **Content creators**: Find similar content across languages.
- **Researchers**: Quickly compare documents or find related texts in different languages.
- **Publishers**: Identify potential translations or similar works efficiently.

This technology opens up new possibilities for understanding and managing text content across
language barriers!

### Explore Details & Advanced Options
The "Explore Details & Advanced Options" section provides additional tools and information:

1. **ISCC Bit-Length**: Adjust the precision of the ISCC code. Higher values provide more detailed
   comparisons but may be more sensitive to minor differences.
2. **Max Tokens**: Set the maximum number of tokens per chunk. This affects how the text is split
   for processing.
3. **Chunked Text**: View how each input text is divided into chunks for processing. Each chunk is
   color-coded and labeled with its size and simprint (a similarity-preserving fingerprint).
4. **Granular Matches**: See a detailed comparison of individual chunks between Text A and Text B.
   This table shows which specific parts of the texts are most similar (above 80%), along with
   their approximate cosine similarity (scaled from -100% to +100%).

For more information about the **ISCC**, see:

- https://github.com/iscc
- https://iscc.codes
- https://iscc.io
- [ISO 24138:2024](https://www.iso.org/standard/77899.html)
"""
        )

    with gr.Row():
        gr.Markdown(
            f"iscc-sct v{sct.__version__} | Source Code: https://github.com/iscc/iscc-sct",
            elem_classes="footer",
        )


if __name__ == "__main__":  # pragma: no cover
    demo.launch()