|
""" |
|
# Semantic-Code Text - Datamodel |
|
|
|
This module provides the pydantic metadata schema for Semantic Text Code results. |
|
The schema is conformant with https://schema.iscc.codes/ |
|
|
|
The `features` property of the top level Metadata Object supports two different formats for |
|
representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**. |
|
These formats are designed to offer flexibility in how feature data is structured and processed, |
|
catering to different use cases where either performance or clarity is prioritized. |
|
|
|
## Features Index-Format (Compact Array Structure): |
|
|
|
In this compact format, features are represented as a list of strings, with optional parallel arrays to |
|
store related attributes such as `offsets`, `sizes`, and `contents`. |
|
|
|
**Example**: |
|
|
|
```json |
|
{ |
|
"maintype": "semantic", |
|
"subtype": "text", |
|
"version": 0, |
|
"simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"], |
|
"offsets": [0, 12], |
|
"sizes": [12, 48], |
|
"contents": ["textchunk no one", "textchunk no two"] |
|
} |
|
|
|
``` |
|
|
|
**Use Case**: |
|
- Best suited for scenarios where storage efficiency is critical, and the overhead of processing |
|
multiple parallel arrays is acceptable. |
|
- Useful when all features share the same set of attributes, allowing for faster bulk processing. |
|
|
|
## Features Object-Format (Self-Descriptive Object Structure): |
|
|
|
In this convenient format, each feature is represented as an individual object containing its |
|
attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but
|
easier to read and work with. |
|
|
|
**Example**: |
|
|
|
```json |
|
{ |
|
"maintype": "semantic",
|
"subtype": "text", |
|
"version": 0, |
|
"simprints": [ |
|
{ |
|
"simprint": "lUjuScFYBik", |
|
"offset": 0, |
|
"size": 25, |
|
"content": "ISCC - Semantic Text-Code" |
|
} |
|
] |
|
} |
|
|
|
``` |
|
**Use Case**: |
|
- Ideal for scenarios where clarity and readability are prioritized. |
|
- Each feature is self-contained, making it easier to understand, extend, and debug. |
|
- Flexibility in including or omitting optional attributes per feature. |
|
|
|
|
|
### Unified FeatureSet Schema: |
|
|
|
The `FeatureSet` model unifies these two formats by allowing either structure to be used. |
|
To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format. |
|
""" |
|
|
|
from typing import List, Optional, Union |
|
from pydantic import BaseModel |
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = ["Feature", "FeatureSet", "Metadata"]
|
|
|
|
|
class PrettyBaseModel(BaseModel):
    # Base class for the models in this module: ``repr()`` of an instance is
    # its indented JSON serialization.

    def pretty_repr(self):
        """
        Serialize the model to a JSON string indented by two spaces.

        Fields that were never set and fields whose value is ``None`` are left
        out; fields are not dropped merely for being equal to their default.

        :return: The JSON representation of the model.
        """
        dump_options = {
            "indent": 2,
            "exclude_unset": True,
            "exclude_none": True,
            "exclude_defaults": False,
        }
        return self.model_dump_json(**dump_options)

    def __repr__(self):
        # Delegate so that ``repr(model)`` and ``model.pretty_repr()`` agree.
        return self.pretty_repr()
|
|
|
|
|
class Feature(PrettyBaseModel):
    # One granular (per text chunk) feature in Object-Format: a simprint plus
    # optional attributes describing the chunk it belongs to.

    # The simprint string; the only required attribute.
    simprint: str
    # Start position of the chunk in the original text. Metadata.get_content()
    # compares it with offset + len(content) of the preceding chunk.
    offset: Optional[int] = None
    # Size of the chunk. NOTE(review): unit (characters vs. bytes) is not
    # established by this module - confirm against the producer.
    size: Optional[int] = None
    # The text of the chunk.
    content: Optional[str] = None
|
|
|
|
|
class FeatureSet(PrettyBaseModel):
    # A set of granular features in either Index-Format or Object-Format.

    # Type/version descriptors; the defaults describe a semantic text code.
    maintype: str = "semantic"
    subtype: str = "text"
    version: int = 0
    # Optional list of floats. NOTE(review): presumably the embedding vector
    # the code was derived from - confirm against the producer.
    embedding: Optional[List[float]] = None
    # Index-Format: a list of simprint strings.
    # Object-Format: a list of self-contained Feature objects.
    simprints: Optional[
        Union[
            List[str],
            List[Feature],
        ]
    ] = None
    # Index-Format only: parallel arrays matched by position with `simprints`
    # (Metadata.to_object_format() pairs entry i with simprint i).
    offsets: Optional[List[int]] = None
    sizes: Optional[List[int]] = None
    contents: Optional[List[str]] = None
|
|
|
|
|
class Metadata(PrettyBaseModel):
    # Top-level metadata object.
    #   iscc       - the ISCC string (required).
    #   characters - optional integer. NOTE(review): presumably the character
    #                count of the processed text - confirm against the producer.
    #   features   - optional list of FeatureSet objects (Index- or Object-Format).

    iscc: str
    characters: Optional[int] = None
    features: Optional[List[FeatureSet]] = None

    def to_index_format(self) -> "Metadata":
        """
        Convert the Metadata object to use the Index-Format for features.

        Feature sets without simprints (``None`` or an empty list) and feature
        sets that already hold plain simprint strings are carried over as
        copies. For Object-Format feature sets, ``offsets``, ``sizes`` and
        ``contents`` are only populated when every feature carries the
        respective attribute; otherwise the array is set to ``None``.

        :return: A new Metadata object.
        """
        if not self.features:
            return self.model_copy()

        new_features = []
        for feature_set in self.features:
            new_feature_set = feature_set.model_copy()
            simprints = feature_set.simprints
            # The truthiness check also covers an empty list, which must not be indexed.
            if simprints and isinstance(simprints[0], Feature):
                new_feature_set.simprints = [f.simprint for f in simprints]
                new_feature_set.offsets = self._parallel_values(simprints, "offset")
                new_feature_set.sizes = self._parallel_values(simprints, "size")
                new_feature_set.contents = self._parallel_values(simprints, "content")
            new_features.append(new_feature_set)

        return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)

    def get_content(self) -> Optional[str]:
        """
        Reconstruct and return the original input text if all necessary data is available.

        Only the first feature set is used. Chunks are ordered by offset and the part of
        each chunk that was already emitted by earlier chunks is dropped, which removes
        overlaps. Gaps between chunks are not filled; the chunks are simply concatenated.

        :return: The reconstructed original text, or None if the necessary data is not available.
        """
        sorted_features = self._sorted_features()
        if sorted_features is None:
            return None

        parts = []
        last_end = 0
        for feature in sorted_features:
            start = feature.offset
            # Number of leading characters already covered by previous chunks.
            skip = max(last_end - start, 0)
            parts.append(feature.content[skip:])
            # Never move the cursor backwards: a chunk contained in an earlier,
            # longer chunk must not cause later text to be emitted twice.
            last_end = max(last_end, start + len(feature.content))

        return "".join(parts)

    def get_overlaps(self) -> List[str]:
        """
        Returns a list of overlapping text between consecutive chunks.

        Only the first feature set is used. Chunks are ordered by offset; for each pair of
        neighbours the tail of the earlier chunk starting at the later chunk's offset is
        reported. For non-overlapping consecutive chunks, returns an empty string.

        :return: List of overlapping text or empty strings; an empty list if the
            necessary data is not available.
        """
        sorted_features = self._sorted_features()
        if sorted_features is None:
            return []

        overlaps = []
        for current_feature, next_feature in zip(sorted_features, sorted_features[1:]):
            current_end = current_feature.offset + len(current_feature.content)
            next_start = next_feature.offset
            if current_end > next_start:
                overlaps.append(current_feature.content[next_start - current_feature.offset :])
            else:
                overlaps.append("")

        return overlaps

    def to_object_format(self) -> "Metadata":
        """
        Convert the Metadata object to use the Object-Format for features.

        Feature sets without simprints (``None`` or an empty list) and feature
        sets that already hold Feature objects are carried over as copies. For
        Index-Format feature sets, entry ``i`` of ``offsets``, ``sizes`` and
        ``contents`` is attached to simprint ``i`` when that entry exists, and
        the parallel arrays are cleared on the converted feature set.

        :return: A new Metadata object.
        """
        if not self.features:
            return self.model_copy()

        new_features = []
        for feature_set in self.features:
            new_feature_set = feature_set.model_copy()
            simprints = feature_set.simprints
            # The truthiness check also covers an empty list, which must not be indexed.
            if simprints and isinstance(simprints[0], str):
                new_simprints = []
                for i, simprint in enumerate(simprints):
                    feature = Feature(simprint=simprint)
                    # Parallel arrays may be missing or shorter than `simprints`.
                    if feature_set.offsets and i < len(feature_set.offsets):
                        feature.offset = feature_set.offsets[i]
                    if feature_set.sizes and i < len(feature_set.sizes):
                        feature.size = feature_set.sizes[i]
                    if feature_set.contents and i < len(feature_set.contents):
                        feature.content = feature_set.contents[i]
                    new_simprints.append(feature)
                new_feature_set.simprints = new_simprints
                new_feature_set.offsets = None
                new_feature_set.sizes = None
                new_feature_set.contents = None
            new_features.append(new_feature_set)

        return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)

    def _sorted_features(self) -> Optional[List[Feature]]:
        """
        Return the Feature objects of the first feature set ordered by offset.

        :return: The sorted features, or None if there is no first feature set with
            simprints or if any feature lacks a non-empty content or an offset.
        """
        if not self.features or not self.features[0].simprints:
            return None

        feature_set = self.features[0]
        if isinstance(feature_set.simprints[0], str):
            # Index-Format input: work on an Object-Format view of the data.
            feature_set = self.to_object_format().features[0]

        if not all(
            feature.content and feature.offset is not None for feature in feature_set.simprints
        ):
            return None

        return sorted(feature_set.simprints, key=lambda feature: feature.offset)

    @staticmethod
    def _parallel_values(features, attribute):
        """
        Collect one attribute of every feature into an Index-Format array.

        :return: The list of values (same length and order as `features`), or None if
            any feature has no value. A shorter, filtered list would pair the values
            with the wrong simprints when read back by position.
        """
        values = [getattr(feature, attribute) for feature in features]
        if any(value is None for value in values):
            return None
        return values
|
|