# MTC / src/utils.py
# (provenance: uploaded by userlocallm via "Upload 17 files", commit 500516e verified)
# src/utils.py
import os
import json
from src.memory import MemoryManager # Corrected import path
from src.llm_interface import LLMInterface # Import LLMInterface
import logging
import spacy
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def chunk_text(text, chunk_size=1000, overlap=100):
    """Split *text* into chunks of at most *chunk_size* characters.

    Consecutive chunks share *overlap* characters (the tail of one chunk is
    repeated at the head of the next) so content spanning a chunk boundary
    appears intact in at least one chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk, in characters.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of string chunks; empty list for empty input.

    Raises:
        ValueError: If chunk_size is not positive, or overlap >= chunk_size
            (the window would never advance, looping forever).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # Stop here: stepping back by `overlap` would otherwise emit a
            # redundant tail chunk fully contained in the one just appended.
            break
        start = end - overlap
    return chunks
def _get_nlp():
    """Return the spaCy pipeline, loading 'en_core_web_lg' once and caching it.

    Loading the large model takes seconds; the original code reloaded it on
    every call. The loaded pipeline is memoized on the function object.
    """
    nlp = getattr(_get_nlp, "_cached_nlp", None)
    if nlp is None:
        nlp = spacy.load('en_core_web_lg')
        _get_nlp._cached_nlp = nlp
    return nlp

def _extractive_summary(context: str) -> str:
    """Extractively summarize *context*: KMeans-cluster sentence embeddings
    and keep the longest sentence of each cluster.

    Returns "" for an empty/whitespace-only context (previously this fed a
    single empty string to KMeans).
    """
    # Naive '.' splitting produces empty fragments (e.g. after the final
    # period); drop them so they are not embedded and clustered.
    sentences = [sent for sent in context.split('.') if sent.strip()]
    if not sentences:
        return ""
    nlp = _get_nlp()
    sentence_embeddings = [nlp(sent).vector for sent in sentences]
    num_clusters = min(len(sentences), 10)  # cap the number of clusters
    # Fixed seed + explicit n_init for reproducible cluster assignments.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    labels = kmeans.fit_predict(sentence_embeddings)
    representative_sentences = []
    for cluster_id in range(num_clusters):
        cluster_sentences = [sent for sent, label in zip(sentences, labels) if label == cluster_id]
        if cluster_sentences:
            # The longest sentence stands in for its cluster.
            representative_sentences.append(max(cluster_sentences, key=len))
    return " ".join(representative_sentences)

def extract_and_summarize(query: str, memory_manager: MemoryManager, llm_interface: LLMInterface, system_prompt: str = "", max_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95) -> str:
    """Answer *query* from stored memories via extractive + LLM summarization.

    Pipeline: retrieve up to 30 relevant memories, join their descriptions
    into a context, truncate it, extractively summarize it (sentence-embedding
    clustering), then ask the LLM to refine the summary into an answer.

    Args:
        query: The user question to answer.
        memory_manager: Source of relevant memories (dicts with a
            'description' key — assumed from usage below; confirm schema).
        llm_interface: LLM used to refine the extractive summary.
        system_prompt: Optional system prompt forwarded to the LLM.
        max_tokens / temperature / top_p: Sampling parameters for the LLM.

    Returns:
        The refined answer, or an error string if the LLM call raises.
    """
    # Retrieve relevant memories from the database.
    relevant_memories = memory_manager.retrieve_relevant_memories(query, limit=30)
    logging.info(f"Retrieved {len(relevant_memories)} relevant memories for query: {query}")

    # Combine relevant memories into a single context.
    context = " ".join([memory['description'] for memory in relevant_memories])
    logging.info(f"Built context: {context}")

    # Truncate by characters as a rough proxy for the LLM's token limit.
    max_context_length = 30000  # Adjust this based on your LLM's token limit
    if len(context) > max_context_length:
        context = context[:max_context_length]
        logging.info(f"Truncated context to {max_context_length} characters.")

    # First-stage extractive summary keeps the LLM prompt small.
    summary = _extractive_summary(context)
    logging.info(f"Generated summary: {summary}")

    # Second stage: let the LLM refine the extractive summary into an answer.
    try:
        refined_summary = llm_interface.send_message(f"Context: {summary}\nQuestion: {query}", system_prompt=system_prompt, max_tokens=max_tokens, temperature=temperature, top_p=top_p)
        logging.info(f"Refined summary: {refined_summary}")
    except Exception as e:
        # Best-effort: surface the failure in the return value rather than crash.
        refined_summary = f"Error refining summary: {e}"
        logging.error(f"Error refining summary: {e}")
    return refined_summary