# clustering edgar - example

- embeddings from https://huggingface.co/BEE-spoke-data/mega-small-embed-syntheticSTS-16384 (on first 16384 tokens)
- summaries on first 25,000 chars


---

In [None]:
#@markdown add auto-Colab formatting with `IPython.display`
from IPython.display import HTML, display
# colab formatting
def set_css(*_args, **_kwargs):
    """
    Emit a small CSS snippet so that `<pre>` output wraps long lines.

    The function is meant to be registered as an IPython event callback.
    IPython hands `pre_run_cell` callbacks an execution-info argument, so any
    positional/keyword arguments are accepted and ignored; calling it with no
    arguments still works.

    Args:
        *_args: Ignored (whatever the event system passes).
        **_kwargs: Ignored.

    Returns:
        None. The style block is rendered through `IPython.display.display`.
    """
    display(
        HTML(
            """
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  """
        )
    )

# Hook set_css into the "pre_run_cell" event so it runs before each cell executes.
# Re-running this cell registers the callback again.
get_ipython().events.register("pre_run_cell", set_css)

In [None]:
!pip install -U -q sentence-transformers datasets

In [None]:
from huggingface_hub import notebook_login
# notebook_login()

In [None]:
from datasets import load_dataset

# Dataset repository and the config (subset) to load from it.
ds_name = "pszemraj/edgar-corpus-htm2020"
# NOTE(review): the notebook intro credits a different embedding model
# (mega-small-embed-syntheticSTS-16384) than this config name suggests;
# confirm which one is intended.
ds_config='embedding-all-distilroberta-v1'
# num_proc=4 requests parallel preparation of the splits.
dataset = load_dataset(ds_name, ds_config, num_proc=4 )
# Bare expression: the notebook displays the loaded object's summary.
dataset

Downloading data:   0%|          | 0.00/22.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6505 [00:00<?, ? examples/s]

Setting num_proc from 4 back to 1 for the test split to disable multiprocessing as it only contains one shard.


Generating test split:   0%|          | 0/172 [00:00<?, ? examples/s]

Setting num_proc from 4 back to 1 for the validation split to disable multiprocessing as it only contains one shard.


Generating validation split:   0%|          | 0/170 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['filename', 'cik', 'text', 'embedding', 'keywords_body', 'display_text', 'summary_first_25k', '__index_level_0__'],
        num_rows: 6505
    })
    test: Dataset({
        features: ['filename', 'cik', 'text', 'embedding', 'keywords_body', 'display_text', 'summary_first_25k', '__index_level_0__'],
        num_rows: 172
    })
    validation: Dataset({
        features: ['filename', 'cik', 'text', 'embedding', 'keywords_body', 'display_text', 'summary_first_25k', '__index_level_0__'],
        num_rows: 170
    })
})

In [None]:
import torch

#@title normalize

def safe_normalize(embeddings, epsilon=1e-10) -> torch.Tensor:
    """
    L2-normalize each row of a 2-D batch of embeddings.

    Norms are clamped from below by `epsilon` (not shifted by it), so rows with
    a non-degenerate norm are scaled to exactly unit length and all-zero rows
    stay zero instead of producing NaN/inf.

    Args:
        embeddings (array-like): Nested lists, a NumPy array, or a tensor laid
            out as (num_embeddings, dim). Values are converted to float32.
        epsilon (float): Lower bound applied to each row norm before dividing.

    Returns:
        torch.Tensor: float32 tensor of the same shape with rows normalized.
    """
    # as_tensor accepts lists, arrays and existing tensors alike (the legacy
    # torch.Tensor(...) constructor is stricter about dtype/device of its input).
    embeddings = torch.as_tensor(embeddings, dtype=torch.float32)

    # Row-wise L2 norm; keepdim so the result broadcasts against `embeddings`.
    norms = torch.linalg.vector_norm(embeddings, ord=2, dim=1, keepdim=True)

    # Clamp tiny norms so the division below is always well defined.
    norms = torch.clamp(norms, min=epsilon)

    return embeddings / norms


# Example usage
# Normalize the train-split embedding column into a single tensor.
corpus_embeddings = safe_normalize(dataset["train"]["embedding"])
# Bare expression: show the tensor shape as the cell output.
corpus_embeddings.shape

torch.Size([6505, 768])

In [None]:
import random
import re


def fast_wc(text):
    """Return the number of words in `text`, a word being a run of regex word characters."""
    word_matches = re.finditer(r"\w+", text)
    return sum(1 for _ in word_matches)


# Pick one train example at random (no seed is set here, so the pick varies between runs).
sample = random.choice(dataset["train"])
# Word count of the full text minus the word count of the 500-character preview shown below.
textwc = fast_wc(sample["text"]) - fast_wc(sample["text"][:500])
# Print the first 500 characters of the text inside a fenced block, then the remaining word count.
print("text:\n", f"\n```\n{sample['text'][:500]} ...\n```\n and {textwc} more words")
print("\n", "-" * 80)
# Print the stored summary for the same example.
print("summary:\n\n", sample["summary_first_25k"])

text:
 
```
## SEC EDGAR Filing

- **Filename**: 1037646_2020.htm
- **CIK**: 1037646
- **Year**: 2020

---

### Business

Item 1.Business
We are a leading global supplier of precision instruments and services. We have strong leadership positions in all of our businesses and believe we hold global number-one market positions in most of them. We are recognized as an innovation leader and our solutions are critical in key research and development, quality control, and manufacturing processes for customers in a ...
```
 and 38917 more words

 --------------------------------------------------------------------------------
summary:

 The SEC's annual report on Form 10-K for the year ended December 31, 2020 is a compilation of financial statements and related notes. These reports provide detailed information about Mettler Toledo International, Inc., a leading global precision instruments and service provider, focusing on research and development in various industries. The company has strong 

In [None]:
!pip install clean-text[gpl] yake sentence-splitter -q
from cleantext import clean

In [None]:
# @title helper fns
import random

def sp(text: str, max_n: int = 45):
    """
    Shortens a given text to a specified maximum number of characters.

    Args:
        text: The text to be shortened. Non-string values are converted with
            `str()` first.
        max_n: Maximum number of characters to keep.

    Returns:
        The text with surrounding whitespace stripped when it has at most
        `max_n` characters; otherwise its first `max_n` characters, stripped,
        with '...' appended.
    """
    # Convert once so the length check and the slicing operate on the same value.
    text = str(text)
    if len(text) <= max_n:
        # Nothing is cut off, so no ellipsis (this includes len(text) == max_n).
        return text.strip()
    return text[:max_n].strip() + "..."

def print_cluster_summary(
    clusters,
    corpus_sentences,
    title="Cluster Summary",
    top_n=3,
    random_sample=False,
    max_n=45,
    max_clusters_printed=100,
):
    """
    Prints details for each cluster, including its size and individual sentences
    from the top, bottom, or a random sample, with a limit on the total number
    of clusters printed.

    Args:
        clusters: List of clusters; each cluster is a list of indices into
            `corpus_sentences`.
        corpus_sentences: The corpus of sentences, indexable by those indices.
        title: Title for the summary printout.
        top_n: Number of sentences to display from the top and from the bottom.
            Values below zero are treated as zero.
        random_sample: If True, prints 2*top_n random sentences from each cluster
            that has more than 2*top_n members; smaller clusters fall back to
            the top/bottom printout.
        max_n: Maximum number of characters for each sentence to be displayed.
        max_clusters_printed: Maximum number of clusters to print (None prints all).

    Returns:
        None. Everything is written with `print`.
    """

    def _print_sentences(header, sentence_ids):
        # One header line followed by one shortened sentence per line.
        print(header)
        for sentence_id in sentence_ids:
            print(f"\t- {sp(corpus_sentences[sentence_id], max_n)}")

    top_n = max(top_n, 0)

    print(f"{title}:\n\ttotal clusters:\t{len(clusters)}\n")
    for i, cluster in enumerate(clusters[:max_clusters_printed]):
        print(f"Cluster #{i+1}, Size: {len(cluster)}")
        print("-" * 20)

        if random_sample and len(cluster) > 2 * top_n:
            _print_sentences(
                f"Random {2*top_n} Sentences:", random.sample(cluster, 2 * top_n)
            )
        else:
            _print_sentences(f"Top {top_n} Sentences:", cluster[:top_n])

            # Start the bottom slice after the top slice so small clusters do not
            # repeat sentences, and so top_n == 0 yields an empty slice
            # (cluster[-0:] would be the whole cluster).
            bottom_start = max(top_n, len(cluster) - top_n)
            bottom_ids = cluster[bottom_start:]
            if bottom_ids:
                _print_sentences(f"Bottom {len(bottom_ids)} Sentences:", bottom_ids)
        print("\n" + "-" * 60)

    # Only announce truncation when some clusters were actually left out.
    if max_clusters_printed is not None and len(clusters) > max_clusters_printed:
        print(f"Printing limited to the first {max_clusters_printed} clusters.")

In [None]:
import re
from sentence_splitter import split_text_into_sentences
#@title filter_and_join_sentences

def filter_and_join_sentences(text, n=2):
    """
    Filter out non-useful sentences from the text based on specific phrases indicating
    non-informative content (e.g., repetitive legal or formal disclosures), then join and
    return the first n useful sentences.

    A sentence is dropped when it contains any exclusion phrase, compared
    case-insensitively and literally (parentheses and other punctuation in a
    phrase are matched as written, not interpreted as regex syntax).

    :param text: String containing the original text. Empty or None yields "".
    :param n: Number of useful sentences to return.
    :return: A string of the first n useful sentences joined with single spaces.
    """
    if not text:
        return ""

    # Split text into sentences
    sentences = split_text_into_sentences(text, language='en')

    # Literal phrases marking sentences to exclude (SEC-related boilerplate and similar)
    exclusion_phrases = [
        "The SEC's annual report on the Securities and Exchange Commission (SEC) is a compilation of",
        "covers various risk factors",
        "filed a Form 10-K",
        "SEC's filing",
        "SEC's Annual Report",
        "The SEC also provides",
        "Securities and Exchange Commission (SEC)",
    ]

    # re.escape makes every phrase a literal match; without it "(SEC)" is a regex
    # group and would never match text that actually contains the parentheses.
    compiled_patterns = [
        re.compile(re.escape(phrase), re.IGNORECASE) for phrase in exclusion_phrases
    ]

    def is_sentence_useful(sentence):
        """Check if a sentence contains none of the exclusion phrases and is thus considered useful."""
        return not any(pattern.search(sentence) for pattern in compiled_patterns)

    # Filter out non-useful sentences and limit to the first n sentences
    useful_sentences = [s for s in sentences if is_sentence_useful(s)][:n]

    # Join and return the useful sentences
    return ' '.join(useful_sentences)


In [None]:
from sentence_splitter import SentenceSplitter, split_text_into_sentences

# Compute a short "display_text" per example in every split: the filtered leading
# summary sentences, then " ..." and the filing's CIK.
# The loaded dataset already lists a "display_text" feature, so this map presumably
# overwrites that column — confirm this is intended.
dataset = dataset.map(
    lambda x: {
        "display_text":
        # Earlier unfiltered variant, kept for reference: first two summary sentences.
        # " ".join(
        #     split_text_into_sentences(x["summary_first_25k"], language="en")[:2]
        # )
        filter_and_join_sentences(x["summary_first_25k"])
        + " ..."
        + f" cik: {x['cik']}"
    },
    num_proc=4,
)
# Train-split display strings, in the same row order as the train split.
display_text = dataset["train"]["display_text"]

Map (num_proc=4):   0%|          | 0/6505 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/172 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/170 [00:00<?, ? examples/s]

In [None]:
import csv
import os
import time

from sentence_transformers import SentenceTransformer, util

# Clustering parameters
min_cluster_size = 6
threshold = 0.85
mcs = min_cluster_size  # short alias used in the calls and printouts
print(f"Start clustering: min_cluster_size={mcs}, threshold={threshold}")
start_time = time.time()


# Run community detection on the normalized embeddings
clusters = util.community_detection(
    corpus_embeddings,
    min_community_size=mcs,
    threshold=threshold,
)

print(f"Clustering done after {time.time() - start_time:.2f} sec")

# Start from every row index and discard the ones that landed in some cluster;
# what remains are the documents that were not clustered.
all_indices = set(range(len(corpus_embeddings)))
for cluster in clusters:
    all_indices.difference_update(cluster)

num_unclustered = len(all_indices)

print(f"Documents not clustered: {num_unclustered} of {len(corpus_embeddings)}")

Start clustering: min_cluster_size=6, threshold=0.85
Clustering done after 0.87 sec
Documents not clustered: 3817 of 6505


In [None]:
# Repeat the run parameters and the unclustered count above the per-cluster listing.
print(f"Params:\tmin_cluster_size={mcs}, threshold={threshold}")
print(f"Documents not clustered: {num_unclustered} of {len(corpus_embeddings)}")
# Show a random sample of display strings per cluster, each cut to 128 characters.
print_cluster_summary(clusters, display_text, random_sample=True,
                      max_n=128)

Params:	min_cluster_size=6, threshold=0.85
Documents not clustered: 3817 of 6505
Cluster Summary:
	total clusters:	131

Cluster #1, Size: 590
--------------------
Random 6 Sentences:
	- In addition, the SEC has provided information regarding Wells Fargo Bank's compliance with applicable servicing criteria, which...
	- The report covers various aspects of the Trust, including business, risk factors, governance, financial statements, and market f...
	- The report also discusses the impact of litigation on the trustee and RMBS trusts, as well as U.S Bank National Association's la...
	- This SEC filing is a compilation of information from the Securities and Exchange Commission (SEC) regarding various aspects of t...
	- It also includes certain relationships and related transactions, director independence, executive compensation, security ownersh...
	- It also includes a summary of legal proceedings, an overview of U.S Bank National Association's lawsuit against it, and a brief...

--------