"""FastAPI application exposing BM25 specification search and an LLM chat endpoint."""

import os
import warnings

from dotenv import load_dotenv
from schemas import *

# NOTE(review): blanking CURL_CA_BUNDLE is a common way to bypass CA bundle
# verification in HTTP clients -- confirm this is intended outside development.
os.environ["CURL_CA_BUNDLE"] = ""
warnings.filterwarnings("ignore")
# Load variables from a .env file before anything below reads os.environ.
load_dotenv()

from datasets import load_dataset
import bm25s
from bm25s.hf import BM25HF
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import litellm

# The index (with its corpus) is fetched once at import time and shared by
# every request.
bm25_index = BM25HF.load_from_hub(
    "OrganizedProgrammers/3GPPBM25IndexSections",
    load_corpus=True,
    token=os.environ["HF_TOKEN"],
)

app = FastAPI(title="RAGnarok", description="Speak with the specifications")
app.mount("/static", StaticFiles(directory="static"), name="static")

origins = [
    "*",
]

# NOTE(review): wildcard origins together with allow_credentials=True is a very
# permissive CORS policy -- confirm it is acceptable for this deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def _boosted_score(metadata, score, query_terms):
    """Return the BM25 score boosted by title-word and spec-id matches.

    Args:
        metadata: Document metadata; its ``'title'`` and ``'id'`` entries are read.
        score: Raw retrieval score of the document.
        query_terms: Set of lower-cased, whitespace-split query words.

    Returns:
        ``score`` plus 0.5 when the lower-cased id is one of the query words,
        plus 0.5 for every query word that also appears in the title.
    """
    title_terms = set(metadata['title'].lower().split())
    spec_id_presence = 0.5 if metadata['id'].lower() in query_terms else 0
    booster = len(query_terms & title_terms) * 0.5
    return score + spec_id_presence + booster


def _normalize_scores(scores_dict):
    """Min-max scale the values of ``scores_dict``, keeping its keys and order.

    Returns an empty dict when ``scores_dict`` is empty.
    """
    if not scores_dict:
        return {}
    # MinMaxScaler expects a 2-D array: one column, one row per score.
    column = np.array(list(scores_dict.values())).reshape(-1, 1)
    scaled = MinMaxScaler().fit_transform(column).flatten()
    return dict(zip(scores_dict, scaled))


@app.get("/")
def main_menu():
    """Serve the index page from the ``templates`` directory."""
    return FileResponse(os.path.join("templates", "index.html"))


@app.post("/search", response_model=SearchResponse)
def search_specifications(req: SearchRequest):
    """Search the BM25 index and return the best-scoring document per spec id.

    Every corpus document is scored against ``req.keyword``; scores are boosted
    for title and id matches, reduced to the best document per spec id, min-max
    normalized, and returned in descending order until the normalized score
    falls below ``req.threshold`` (interpreted as a percentage).

    Args:
        req: Search request providing ``keyword`` and ``threshold``.

    Returns:
        A ``SearchResponse`` whose ``results`` is a list of dicts with the keys
        ``id``, ``title``, ``section``, ``content`` and ``similarity``.
    """
    keywords = req.keyword
    min_score = req.threshold / 100

    query_tokens = bm25s.tokenize(keywords)
    # Retrieve the whole corpus so that each spec's best document is seen.
    results, scores = bm25_index.retrieve(query_tokens, k=len(bm25_index.corpus))

    # The query word set is the same for every document; build it once instead
    # of once per retrieved document.
    query_terms = set(keywords.lower().split())

    spec_scores = {}
    spec_details = {}
    for doc, score in zip(results[0], scores[0]):
        metadata = doc["metadata"]
        spec = metadata["id"]
        boosted_score = _boosted_score(metadata, score, query_terms)
        # Keep only the highest boosted score seen for each spec id.
        if spec not in spec_scores or boosted_score > spec_scores[spec]:
            spec_scores[spec] = boosted_score
            spec_details[spec] = {
                'original_score': score,
                'boosted_score': boosted_score,
                'doc': doc,
            }

    normalized_scores = _normalize_scores(spec_scores)
    for spec, details in spec_details.items():
        details["normalized_score"] = normalized_scores[spec]

    unique_specs = sorted(
        normalized_scores.keys(),
        key=lambda x: normalized_scores[x],
        reverse=True,
    )

    results_out = []
    for spec in unique_specs:
        details = spec_details[spec]
        # Specs are sorted by descending score, so every later one fails too.
        if details['normalized_score'] < min_score:
            break
        metadata = details['doc']['metadata']
        results_out.append({
            'id': metadata['id'],
            'title': metadata['title'],
            'section': metadata['section_title'],
            'content': details['doc']['text'],
            'similarity': int(details['normalized_score'] * 100),
        })

    return SearchResponse(results=results_out)


@app.post("/chat", response_model=ChatResponse)
def questions_the_sources(req: ChatRequest):
    """Forward the chat messages to a Gemini model through litellm.

    Args:
        req: Chat request providing ``model`` (used as ``gemini/<model>``) and
            ``messages`` (passed to litellm unchanged).

    Returns:
        A ``ChatResponse`` holding the content of the first completion choice.

    Raises:
        KeyError: If the ``GEMINI`` environment variable is not set.
    """
    model = req.model
    resp = litellm.completion(
        model=f"gemini/{model}",
        messages=req.messages,
        api_key=os.environ["GEMINI"],
    )
    return ChatResponse(response=resp.choices[0].message.content)