import streamlit as st
import pandas as pd
import os
import json
import tempfile
from typing import List, Dict

import evaluate
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_groq import ChatGroq

from prompts_v1 import *

# Pass API keys through from the environment (they must already be exported)
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "")
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY", "")

# Configure page settings
st.set_page_config(
    page_title="RAG Evaluator",
    page_icon="📊",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better UI (placeholder; add style rules here if needed)
st.markdown("""
""", unsafe_allow_html=True)

# Initialize session state
if 'evaluation_results' not in st.session_state:
    st.session_state.evaluation_results = None


class RAGEvaluator:
    def __init__(self):
        # self.llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0.2)
        self.llm = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo")
        self.eval_prompts = {
            "diversity_metrics": diversity_metrics,
            "creativity_metric": creativity_metric,
            "groundedness_metric": groundedness_metric,
            "coherence_metric": coherence_metric,
            "pointwise_metric": pointwise_metric,
            # "pairwise_metric": pairwise_metric
        }

    def evaluate_custom_metrics(self, df: pd.DataFrame, selected_metrics: List[str]) -> pd.DataFrame:
        """Score each row with the selected LLM-judged metrics."""
        for metric in selected_metrics:
            prompt = self.eval_prompts.get(metric)
            if not prompt:
                continue
            review_template = PromptTemplate.from_template(prompt)
            eval_score = []
            explanation = []
            progress_bar = st.progress(0)
            for idx in range(len(df)):
                progress_bar.progress((idx + 1) / len(df))
                question = df["question"][idx]
                answer = df["answer"][idx]
                context = df["context"][idx]
                final_prompt = review_template.format(
                    question=question,
                    answer=answer,
                    context=context
                )
                # The prompt instructs the model to return JSON with
                # "eval_score" and "explanation" keys.
                response = self.llm.invoke(final_prompt).content
                data_dict = json.loads(response)
                eval_score.append(data_dict["eval_score"])
                explanation.append(data_dict["explanation"])
            df[f"{metric}_score"] = eval_score
            df[f"{metric}_explanation"] = explanation
            progress_bar.empty()
        return df

    def evaluate_traditional_metrics(self, df: pd.DataFrame, selected_metrics: List[str]) -> pd.DataFrame:
        """Compute BLEU, ROUGE and Perplexity scores with the `evaluate` library."""
        if "BLEU" in selected_metrics:
            bleu = evaluate.load('bleu')
            scores = []
            for _, row in df.iterrows():
                score = bleu.compute(
                    predictions=[row['answer']],
                    references=[row['context']],
                    max_order=2
                )
                scores.append(score['bleu'])
            df['bleu_score'] = scores

        if "ROUGE" in selected_metrics:
            rouge = evaluate.load("rouge")
            rouge1_scores = []
            rouge2_scores = []
            rougeL_scores = []
            for _, row in df.iterrows():
                scores = rouge.compute(
                    predictions=[row['answer']],
                    references=[row['context']],
                    rouge_types=['rouge1', 'rouge2', 'rougeL']
                )
                rouge1_scores.append(scores['rouge1'])
                rouge2_scores.append(scores['rouge2'])
                rougeL_scores.append(scores['rougeL'])
            df['rouge1_score'] = rouge1_scores
            df['rouge2_score'] = rouge2_scores
            df['rougeL_score'] = rougeL_scores

        if "Perplexity" in selected_metrics:
            try:
                perplexity = evaluate.load("perplexity", module_type="metric")
                scores = []
                for _, row in df.iterrows():
                    try:
                        score = perplexity.compute(
                            model_id="gpt2",
                            add_start_token=False,
                            predictions=[row['answer']]
                        )
                        scores.append(score['mean_perplexity'])
                    except KeyError:
                        # If mean_perplexity is not available, fall back to perplexity
                        scores.append(score.get('perplexity', 0))
                    except Exception as e:
                        st.warning(f"Skipping perplexity calculation for one row due to: {str(e)}")
                        scores.append(0)
                df['perplexity_score'] = scores
            except Exception as e:
                st.error(f"Error calculating perplexity: {str(e)}")
                df['perplexity_score'] = [0] * len(df)

        return df
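
# --- Illustrative sketch (not part of the original app) ---------------------
# The metric templates imported from prompts_v1 are not shown in this file.
# Based on how evaluate_custom_metrics uses them, each template is assumed to
# expose {question}, {answer} and {context} placeholders and to instruct the
# model to reply with a JSON object containing "eval_score" and "explanation".
# The string below is a hypothetical example of such a template, included only
# to document that contract; the real prompts live in prompts_v1.
_example_coherence_template = """
You are grading the coherence of an answer produced by a RAG system.

Question: {question}
Context: {context}
Answer: {answer}

Rate the coherence of the answer on a scale of 1 to 5 and justify the rating.
Respond with JSON only, in the form:
{{"eval_score": <integer 1-5>, "explanation": "<one or two sentences>"}}
"""
# -----------------------------------------------------------------------------
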
def main():
    st.title("🎯 RAG Evaluator")
    st.write("Upload your data and select evaluation metrics to analyze your RAG system's performance.")

    # Sidebar configuration
    st.sidebar.header("Configuration")

    # File upload
    uploaded_file = st.sidebar.file_uploader(
        "Upload your evaluation data (CSV/Excel)",
        type=['csv', 'xlsx']
    )

    # Metric selection
    st.sidebar.subheader("Select Evaluation Metrics")
    custom_metrics = st.sidebar.expander("Custom Metrics", expanded=True)
    selected_custom_metrics = custom_metrics.multiselect(
        "Choose custom metrics:",
        ["diversity_metrics", "creativity_metric", "groundedness_metric", "coherence_metric", "pointwise_metric"],
        default=["coherence_metric"]
    )

    traditional_metrics = st.sidebar.expander("Traditional Metrics", expanded=True)
    selected_traditional_metrics = traditional_metrics.multiselect(
        "Choose traditional metrics:",
        ["BLEU", "ROUGE", "Perplexity"],
        default=["BLEU"]
    )

    if uploaded_file is not None:
        try:
            # Read the uploaded file
            if uploaded_file.name.endswith('.csv'):
                df = pd.read_csv(uploaded_file)
            else:
                df = pd.read_excel(uploaded_file)

            # Display data preview
            st.subheader("📊 Data Preview")
            st.dataframe(df.head(), use_container_width=True)

            # Initialize evaluator
            evaluator = RAGEvaluator()

            # Evaluation button
            if st.button("🚀 Start Evaluation", type="primary"):
                with st.spinner("Evaluating..."):
                    # Perform evaluations
                    if selected_custom_metrics:
                        df = evaluator.evaluate_custom_metrics(df, selected_custom_metrics)
                    if selected_traditional_metrics:
                        df = evaluator.evaluate_traditional_metrics(df, selected_traditional_metrics)

                    st.session_state.evaluation_results = df

                    # Save results to a temporary Excel file and offer it for download
                    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
                        df.to_excel(tmp.name, index=False)
                    with open(tmp.name, 'rb') as f:
                        st.download_button(
                            label="📥 Download Results",
                            data=f.read(),
                            file_name="rag_evaluation_results.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                        )

            # Display results if available
            if st.session_state.evaluation_results is not None:
                st.subheader("📈 Evaluation Results")

                # Create tabs for different result views
                tab1, tab2 = st.tabs(["📊 Metrics Overview", "📝 Detailed Results"])

                with tab1:
                    # Display metric summaries
                    cols = st.columns(len(selected_custom_metrics) + len(selected_traditional_metrics))
                    metric_idx = 0

                    for metric in selected_custom_metrics:
                        with cols[metric_idx]:
                            avg_score = st.session_state.evaluation_results[f"{metric}_score"].mean()
                            st.metric(
                                label=metric.replace('_', ' ').title(),
                                value=f"{avg_score:.2f}"
                            )
                        metric_idx += 1

                    if "BLEU" in selected_traditional_metrics:
                        with cols[metric_idx]:
                            avg_bleu = st.session_state.evaluation_results['bleu_score'].mean()
                            st.metric(label="BLEU Score", value=f"{avg_bleu:.2f}")
                        metric_idx += 1

                    if "ROUGE" in selected_traditional_metrics:
                        with cols[metric_idx]:
                            avg_rouge = st.session_state.evaluation_results['rouge1_score'].mean()
                            st.metric(label="ROUGE-1 Score", value=f"{avg_rouge:.2f}")
                        metric_idx += 1

                    if "Perplexity" in selected_traditional_metrics:
                        with cols[metric_idx]:
                            avg_perplexity = st.session_state.evaluation_results['perplexity_score'].mean()
                            st.metric(label="Perplexity Score", value=f"{avg_perplexity:.2f}")
                        metric_idx += 1

                with tab2:
                    st.dataframe(
                        st.session_state.evaluation_results,
                        use_container_width=True,
                        height=400
                    )

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
    else:
        # Display welcome message and instructions
        st.info("👈 Please upload your evaluation data file (CSV/Excel) from the sidebar to begin.")

        # Display sample format
        st.subheader("📋 Expected Data Format")
        sample_data = pd.DataFrame({
            'question': ['What is RAG?', 'How does RAG work?'],
            'answer': ['RAG is...', 'RAG works by...'],
            'context': ['RAG (Retrieval-Augmented Generation)...', 'The RAG process involves...']
        })
        st.dataframe(sample_data, use_container_width=True)


if __name__ == "__main__":
    main()
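
# Example usage (a minimal sketch; the file name "app.py" is an assumption):
#   export OPENAI_API_KEY=sk-...   # required by ChatOpenAI
#   export GROQ_API_KEY=gsk_...    # only needed if the ChatGroq line is re-enabled
#   streamlit run app.py
# The uploaded CSV/Excel file must contain 'question', 'answer' and 'context'
# columns, matching the sample format shown in the sidebar fallback above.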