# Hugging Face Spaces page header captured with the source: "Spaces: Sleeping"
import functools
import html
import os

import gradio as gr
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
# Location of the course catalogue CSV, resolved against the current working
# directory (the CSV is expected to be uploaded next to this script).
csv_file_path = os.path.join(os.getcwd(), 'Analytics_Vidhya_Free_Course_data.csv')

# Load the dataset
df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Load the pre-trained model for embeddings (using SentenceTransformers)
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')


def _as_text(column):
    """Return *column* converted to strings, with missing values as ''."""
    # astype(str) turns NaN into the literal "nan"; mask those back to "" so
    # missing cells neither pollute the text nor propagate NaN through the
    # concatenation below.
    return column.astype(str).where(column.notna(), "")


# Build one searchable text per course from the first two CSV columns
# (presumably title and description), the instructor, the rating and the
# category. Each column is stringified element-wise: calling str() on the
# whole Series would append the same multi-line Series repr to every row.
df['full_text'] = _as_text(df.iloc[:, 0]).str.cat(
    [
        _as_text(df.iloc[:, 1]),
        _as_text(df['Instructor Name']),
        _as_text(df['Rating']),
        _as_text(df['Category']),
    ],
    sep=" ",
)

# Convert full course texts into embeddings (row i of the tensor corresponds
# to row i of df).
course_embeddings = model.encode(df['full_text'].tolist(), convert_to_tensor=True)
@functools.lru_cache(maxsize=1)
def _get_paraphraser():
    """Build the T5 paraphrasing pipeline once and reuse it across calls."""
    return pipeline('text2text-generation', model='Vamsi/T5_Paraphrase_Paws')


# Function to expand the query using paraphrasing
def expand_query(query):
    """Return three sampled paraphrases of *query*.

    Args:
        query: The user's search text.

    Returns:
        A list of three generated paraphrase strings. Sampling is enabled, so
        the output is not deterministic.
    """
    # Loading the checkpoint is expensive, so the pipeline is cached instead
    # of being rebuilt on every search.
    paraphraser = _get_paraphraser()
    # NOTE(review): the model card for this checkpoint shows inputs prefixed
    # with "paraphrase: "; confirm against the card if results look off.
    generated = paraphraser(
        "paraphrase: " + query,
        num_return_sequences=3,
        max_length=50,
        do_sample=True,
    )
    return [item['generated_text'] for item in generated]
# Function to search for the most relevant courses
def search_courses(query, level_filter=None, category_filter=None, top_k=3):
    """Return the courses most similar to *query* as an HTML ordered list.

    Args:
        query: Free-text search string.
        level_filter: If truthy, keep only rows whose 'Level of Difficulty'
            equals this value.
        category_filter: If truthy, keep only rows whose 'Category' equals
            this value.
        top_k: Maximum number of courses to return.

    Returns:
        An HTML string: an ``<ol>`` of linked course titles with their
        descriptions, or a ``<p>`` message when the query is empty or no
        course passes the filters.
    """
    query = (query or "").strip()
    if not query:
        # Nothing to paraphrase or embed.
        return "<p>Please enter a search query.</p>"

    # Apply the filters first so no model work is done when nothing matches.
    keep = pd.Series(True, index=df.index)
    if level_filter:
        keep &= df['Level of Difficulty'] == level_filter
    if category_filter:
        keep &= df['Category'] == category_filter
    if not keep.any():
        return "<p>No matching courses found.</p>"
    filtered_df = df[keep]

    # Score the original query alongside its paraphrases: the paraphrases are
    # sampled and may drift, so the user's own wording must always count.
    candidate_queries = [query, *expand_query(query)]
    query_embeddings = model.encode(candidate_queries, convert_to_tensor=True)

    # One row of similarities per candidate query; keep the best score each
    # course achieved over all candidates.
    similarities = util.pytorch_cos_sim(query_embeddings, course_embeddings)
    aggregated_similarities = similarities.max(dim=0).values

    # Select the filtered courses by position with a boolean mask, so the
    # result does not depend on df's index labels matching tensor positions.
    keep_mask = torch.as_tensor(keep.to_numpy(), device=aggregated_similarities.device)
    filtered_similarities = aggregated_similarities[keep_mask]

    k = max(0, min(top_k, len(filtered_similarities)))
    top_results = filtered_similarities.topk(k=k)

    # 'full_text' is appended to df as its last column at load time, so the
    # URL (assumed to be the last column of the CSV) must be looked up among
    # the remaining columns rather than with iloc[..., -1].
    url_column = [name for name in df.columns if name != 'full_text'][-1]

    def _cell_text(value):
        """HTML-escape a cell value, rendering missing values as ''."""
        return "" if pd.isna(value) else html.escape(str(value))

    # Prepare the output as clickable links
    results = []
    for position in top_results.indices.tolist():
        row = filtered_df.iloc[position]
        course_title = _cell_text(row['Course Title'])
        course_description = _cell_text(row.iloc[1])
        course_url = _cell_text(row[url_column])
        course_link = f'<a href="{course_url}" target="_blank">{course_title}</a>'
        results.append(f"<strong>{course_link}</strong><br>{course_description}<br><br>")

    # Combine all results into an HTML formatted list
    return "<ol>" + "".join(f"<li>{result}</li>" for result in results) + "</ol>"
# Create Gradio UI
def create_gradio_interface():
    """Assemble the search page and return the Gradio Blocks app.

    The page has a query textbox, a collapsed accordion holding the level and
    category dropdowns, a search button, and an HTML area for the results.
    Clicking the button calls ``search_courses`` with the three inputs.
    """
    level_choices = ["Beginner", "Intermediate", "Advanced"]
    category_choices = ["Data Science", "Machine Learning", "Deep Learning", "AI", "NLP"]

    with gr.Blocks() as app:
        # Page heading and usage hint
        gr.Markdown("# Analytics Vidhya Free Courses")
        gr.Markdown("Enter your query and use filters to narrow down the search.")

        # Free-text query input
        query_box = gr.Textbox(
            label=" Search for a course",
            placeholder="Enter course topic or description",
        )

        # Optional filters, hidden until the accordion is opened
        with gr.Accordion(" Filters", open=False):
            level_dropdown = gr.Dropdown(
                choices=level_choices,
                label=" Course Level",
                multiselect=False,
            )
            category_dropdown = gr.Dropdown(
                choices=category_choices,
                label=" Category",
                multiselect=False,
            )

        run_button = gr.Button("Search")

        # Results are rendered as raw HTML
        results_html = gr.HTML(label="Search Results")

        # Wire the button to the search function
        run_button.click(
            fn=search_courses,
            inputs=[query_box, level_dropdown, category_dropdown],
            outputs=results_html,
        )

    return app
# Build the interface at module level so `demo` stays importable by name.
demo = create_gradio_interface()

# Launch Gradio interface only when run as a script, so importing this module
# does not start a blocking web server as a side effect.
if __name__ == "__main__":
    demo.launch(share=True, debug=True)