# src/utils.py
"""Utility helpers: text chunking and memory-based extractive summarization.

Heavy third-party dependencies (spaCy, scikit-learn) are imported lazily so
that importing this module — e.g. just for :func:`chunk_text` — does not
require the full ML stack.
"""

from __future__ import annotations

import json
import logging
import os
from functools import lru_cache
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Type-checking-only imports: keeps the module importable without the
    # project packages on sys.path (annotations are lazy via __future__).
    from src.llm_interface import LLMInterface
    from src.memory import MemoryManager

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


def chunk_text(text, chunk_size=1000, overlap=100):
    """Split ``text`` into chunks of at most ``chunk_size`` characters.

    Consecutive chunks share ``overlap`` characters so that content falling
    on a chunk boundary is not lost.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Characters shared between consecutive chunks. Must satisfy
            ``0 <= overlap < chunk_size`` — otherwise the window would never
            advance and the loop would run forever.

    Returns:
        List of chunks covering ``text`` in order; ``[]`` for empty input.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= len(text):
            # The chunk just appended already reaches the end of the text;
            # stepping back by ``overlap`` would emit a redundant tail chunk.
            break
        start = end - overlap
    return chunks


@lru_cache(maxsize=1)
def _get_nlp():
    """Load the spaCy model once per process (loading it per call is slow)."""
    import spacy  # deferred: heavy optional dependency

    return spacy.load('en_core_web_lg')


def extract_and_summarize(query: str, memory_manager: MemoryManager,
                          llm_interface: LLMInterface,
                          system_prompt: str = "", max_tokens: int = 512,
                          temperature: float = 0.7,
                          top_p: float = 0.95) -> str:
    """Answer ``query`` from stored memories via cluster-based summarization.

    Retrieves the most relevant memories, clusters their sentences with
    KMeans over spaCy embeddings, picks one representative sentence per
    cluster as an extractive summary, and asks the LLM to refine that
    summary into a final answer.

    Args:
        query: The user question driving retrieval and refinement.
        memory_manager: Provides ``retrieve_relevant_memories(query, limit)``.
        llm_interface: Provides ``send_message(...)`` for the refinement step.
        system_prompt: Optional system prompt forwarded to the LLM.
        max_tokens: Generation budget forwarded to the LLM.
        temperature: Sampling temperature forwarded to the LLM.
        top_p: Nucleus-sampling parameter forwarded to the LLM.

    Returns:
        The refined summary, or an ``"Error refining summary: ..."`` string
        if the LLM call fails (best-effort contract — never raises for that).
    """
    from sklearn.cluster import KMeans  # deferred: heavy optional dependency

    # Retrieve relevant memories from the database.
    relevant_memories = memory_manager.retrieve_relevant_memories(query, limit=30)
    logging.info("Retrieved %d relevant memories for query: %s",
                 len(relevant_memories), query)

    # Combine relevant memories into a single context string.
    context = " ".join(memory['description'] for memory in relevant_memories)
    logging.info("Built context: %s", context)

    # Truncate the context if it exceeds the limit (character-based proxy
    # for the LLM's token limit — adjust per model).
    max_context_length = 30000
    if len(context) > max_context_length:
        context = context[:max_context_length]
        logging.info("Truncated context to %d characters.", max_context_length)

    # Naive sentence split; drop empty fragments (consecutive/trailing
    # periods otherwise produce zero vectors that skew the clustering).
    sentences = [s.strip() for s in context.split('.') if s.strip()]

    if sentences:
        nlp = _get_nlp()
        sentence_embeddings = [nlp(sent).vector for sent in sentences]

        # Cluster sentences. n_init/random_state make the result
        # deterministic and silence modern scikit-learn warnings.
        num_clusters = min(len(sentences), 10)  # adjust as needed
        kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
        labels = kmeans.fit_predict(sentence_embeddings)

        # Select the longest sentence of each cluster as its representative.
        representative_sentences = []
        for cluster_id in range(num_clusters):
            cluster_sentences = [
                sent for sent, label in zip(sentences, labels)
                if label == cluster_id
            ]
            if cluster_sentences:
                representative_sentences.append(max(cluster_sentences, key=len))

        summary = " ".join(representative_sentences)
    else:
        # No retrievable content — let the LLM answer from the query alone.
        summary = ""
    logging.info("Generated summary: %s", summary)

    # Use the LLM to refine the extractive summary into a final answer.
    try:
        refined_summary = llm_interface.send_message(
            f"Context: {summary}\nQuestion: {query}",
            system_prompt=system_prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        logging.info("Refined summary: %s", refined_summary)
    except Exception as e:
        # Best-effort contract: report the failure in the return value
        # instead of propagating, matching the original behavior.
        refined_summary = f"Error refining summary: {e}"
        logging.error("Error refining summary: %s", e)
    return refined_summary