iscc-sct / iscc_sct /models.py
titusz's picture
Synced repo using 'sync_with_huggingface' Github Action
8c51bed verified
"""
# Semantic-Code Text - Datamodel
This module provides the pydantic metadata schema for Semantic Text Code results.
The schema is conformant with https://schema.iscc.codes/
The `features` property of the top level Metadata Object supports two different formats for
representing granular (per text chunk) features: the **Index-Format** and the **Object-Format**.
These formats are designed to offer flexibility in how feature data is structured and processed,
catering to different use cases where either performance or clarity is prioritized.
## Features Index-Format (Compact Array Structure):
In this compact format, features are represented as a list of strings, with optional parallel arrays to
store related attributes such as `offsets`, `sizes`, and `contents`.
**Example**:
```json
{
"maintype": "semantic",
"subtype": "text",
"version": 0,
"simprints": ["XZjeSfdyVi0", "NGrHC1F1Q-k"],
"offsets": [0, 12],
"sizes": [12, 48],
"contents": ["textchunk no one", "textchunk no two"]
}
```
**Use Case**:
- Best suited for scenarios where storage efficiency is critical, and the overhead of processing
multiple parallel arrays is acceptable.
- Useful when all features share the same set of attributes, allowing for faster bulk processing.
## Features Object-Format (Self-Descriptive Object Structure):
In this convenient format, each feature is represented as an individual object containing its
attributes (`simprint`, `offset`, `size`, `content`). This makes the structure more verbose but
easier to read and work with.
**Example**:
```json
{
"maintype": "semantic",
"subtype": "text",
"version": 0,
"simprints": [
{
"simprint": "lUjuScFYBik",
"offset": 0,
"size": 25,
"content": "ISCC - Semantic Text-Code"
}
]
}
```
**Use Case**:
- Ideal for scenarios where clarity and readability are prioritized.
- Each feature is self-contained, making it easier to understand, extend, and debug.
- Flexibility in including or omitting optional attributes per feature.
### Unified FeatureSet Schema:
The `FeatureSet` model unifies these two formats by allowing either structure to be used.
To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
"""
from typing import List, Optional, Union
from pydantic import BaseModel
# Names exported by a star-import of this module; `PrettyBaseModel` is not listed.
__all__ = ["Feature", "FeatureSet", "Metadata"]
class PrettyBaseModel(BaseModel):
    # Base model whose `repr()` is its own indented JSON serialization.
    # (Documented with comments rather than a class docstring because pydantic
    # copies model docstrings into the generated JSON schema description.)

    def pretty_repr(self):
        """
        Serialize the model to JSON with a 2-space indent.
        Fields that were never set and fields whose value is None are omitted;
        fields explicitly set to their default value are kept.
        :return: The JSON string.
        """
        dump_options = dict(
            indent=2,
            exclude_unset=True,
            exclude_none=True,
            exclude_defaults=False,
        )
        return self.model_dump_json(**dump_options)

    def __repr__(self):
        """Return the same string as `pretty_repr()`."""
        return self.pretty_repr()
class Feature(PrettyBaseModel):
    # A single granular feature in Object-Format: the simprint string together
    # with its optional per-chunk attributes.
    # (Documented with comments rather than a class docstring because pydantic
    # copies model docstrings into the generated JSON schema description.)

    # The feature string itself (required), e.g. "lUjuScFYBik".
    simprint: str
    # Start position of the chunk; `Metadata.get_content` uses it as an index
    # into the reconstructed Python string.
    offset: Optional[int] = None
    # Presumably the length of the chunk; not read anywhere in this module - TODO confirm units.
    size: Optional[int] = None
    # Text of the chunk this feature was computed from.
    content: Optional[str] = None
class FeatureSet(PrettyBaseModel):
    # A set of granular features in either Index-Format or Object-Format.
    # (Documented with comments rather than a class docstring because pydantic
    # copies model docstrings into the generated JSON schema description.)

    maintype: str = "semantic"
    subtype: str = "text"
    version: int = 0
    embedding: Optional[List[float]] = None

    # Index-Format  -> List[str]     (plain simprint strings)
    # Object-Format -> List[Feature] (self-describing objects)
    simprints: Union[List[str], List[Feature], None] = None

    # Optional parallel arrays used by the Index-Format; entry i of each array
    # belongs to simprints[i].
    offsets: Optional[List[int]] = None
    sizes: Optional[List[int]] = None
    contents: Optional[List[str]] = None
class Metadata(PrettyBaseModel):
    # Top-level result object: the ISCC string, an optional character count and
    # an optional list of feature sets, plus helpers to convert between the
    # Index-Format and the Object-Format and to reconstruct the chunked text.
    # (Documented with comments rather than a class docstring because pydantic
    # copies model docstrings into the generated JSON schema description.)

    iscc: str
    characters: Optional[int] = None
    features: Optional[List[FeatureSet]] = None

    @staticmethod
    def _column(features: List[Feature], attr: str) -> Optional[list]:
        """Collect `attr` from every feature, or return None unless all features define it."""
        values = [getattr(feature, attr) for feature in features]
        # A parallel array is only meaningful when entry i belongs to simprint i.
        # Dropping individual None values would shift later entries onto the
        # wrong simprint, so an incomplete column is omitted entirely.
        if any(value is None for value in values):
            return None
        return values

    def _sorted_chunks(self) -> Optional[List[Feature]]:
        """
        Return the features of the first feature set as `Feature` objects sorted by offset.
        :return: The sorted features, or None if there are no simprints or if any
            feature lacks a non-empty content or an offset.
        """
        if not self.features or not self.features[0].simprints:
            return None
        feature_set = self.features[0]
        if isinstance(feature_set.simprints[0], str):
            # Convert to object format if in index format
            feature_set = self.to_object_format().features[0]
        chunks = feature_set.simprints
        if not all(chunk.content and chunk.offset is not None for chunk in chunks):
            return None
        return sorted(chunks, key=lambda chunk: chunk.offset)

    def to_index_format(self) -> "Metadata":
        """
        Convert the Metadata object to use the Index-Format for features.
        Feature sets that have no simprints or are already in Index-Format are
        copied unchanged. For converted feature sets, `offsets`, `sizes` and
        `contents` are each emitted only if every feature carries that
        attribute; otherwise the array is left as None so that the parallel
        arrays always stay aligned with `simprints`.
        :return: A new Metadata object.
        """
        if not self.features:
            return self.model_copy()
        new_features = []
        for feature_set in self.features:
            new_feature_set = feature_set.model_copy()
            simprints = feature_set.simprints
            # The truthiness test also covers an empty list, for which
            # `simprints[0]` would raise IndexError.
            if simprints and not isinstance(simprints[0], str):
                new_feature_set.simprints = [f.simprint for f in simprints]
                new_feature_set.offsets = self._column(simprints, "offset")
                new_feature_set.sizes = self._column(simprints, "size")
                new_feature_set.contents = self._column(simprints, "content")
            new_features.append(new_feature_set)
        return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)

    def get_content(self) -> Optional[str]:
        """
        Reconstruct and return the original input text if all necessary data is available.
        Only the first feature set is used. This method removes overlaps in adjacent text chunks.
        :return: The reconstructed original text, or None if the necessary data is not available.
        """
        chunks = self._sorted_chunks()
        if chunks is None:
            return None
        parts = []
        last_end = 0
        for chunk in chunks:
            # Skip the leading part of the chunk that was already emitted.
            already_emitted = max(last_end - chunk.offset, 0)
            parts.append(chunk.content[already_emitted:])
            # Never move backwards: a chunk nested inside an earlier one must
            # not cause the text after it to be emitted twice.
            last_end = max(last_end, chunk.offset + len(chunk.content))
        return "".join(parts)

    def get_overlaps(self) -> List[str]:
        """
        Returns a list of overlapping text between consecutive chunks of the first feature set.
        For non-overlapping consecutive chunks, returns an empty string.
        :return: List of overlapping text or empty strings; an empty list if the
            necessary data is not available.
        """
        chunks = self._sorted_chunks()
        if chunks is None:
            return []
        overlaps = []
        for current, following in zip(chunks, chunks[1:]):
            current_end = current.offset + len(current.content)
            if current_end > following.offset:
                # Tail of the current chunk starting where the next chunk begins.
                overlaps.append(current.content[following.offset - current.offset :])
            else:
                overlaps.append("")
        return overlaps

    def to_object_format(self) -> "Metadata":
        """
        Convert the Metadata object to use the Object-Format for features.
        Feature sets that have no simprints or are already in Object-Format are
        copied unchanged. For converted feature sets the parallel arrays are
        folded into the individual `Feature` objects and then cleared.
        :return: A new Metadata object.
        """
        if not self.features:
            return self.model_copy()
        new_features = []
        for feature_set in self.features:
            new_feature_set = feature_set.model_copy()
            simprints = feature_set.simprints
            # The truthiness test also covers an empty list, for which
            # `simprints[0]` would raise IndexError.
            if simprints and not isinstance(simprints[0], Feature):
                columns = (
                    ("offset", feature_set.offsets),
                    ("size", feature_set.sizes),
                    ("content", feature_set.contents),
                )
                new_simprints = []
                for i, simprint in enumerate(simprints):
                    feature = Feature(simprint=simprint)
                    for attr, column in columns:
                        # Arrays may be absent or shorter than `simprints`.
                        if column and i < len(column):
                            setattr(feature, attr, column[i])
                    new_simprints.append(feature)
                new_feature_set.simprints = new_simprints
                new_feature_set.offsets = None
                new_feature_set.sizes = None
                new_feature_set.contents = None
            new_features.append(new_feature_set)
        return Metadata(iscc=self.iscc, characters=self.characters, features=new_features)