Spaces:
Sleeping
Sleeping
# app.py | |
import gradio as gr | |
import json | |
import re | |
import os | |
from datetime import datetime | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import torch | |
from huggingface_hub import login | |
# Authenticate against the Hugging Face Hub before anything else runs.
# The token is read from the HUGGINGFACE_TOKEN environment variable; a missing
# token or a failed login is reported on stdout and then aborts start-up by
# re-raising the error.
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
try:
    if not hf_token:
        raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
    login(token=hf_token)
except Exception as login_error:
    print(f"Error during Hugging Face login: {login_error}")
    raise
class TranscriptAnalyzer:
    """Analyze meeting transcripts with a causal language model.

    Dates and claim/case numbers are pulled out of the transcript with regular
    expressions, embedded in an instruction prompt together with the
    transcript, and the model's generated text is returned as the analysis.
    """

    # Alternatives are tried left to right at each position, so the ISO-style
    # form (YYYY-MM-DD) must come before the day/month form; otherwise
    # "2024-03-15" would also be reported as the bogus date "24-03-15".
    _DATE_PATTERNS = (
        r'\b\d{4}[-/]\d{1,2}[-/]\d{1,2}\b',
        r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b',
        # Month name (abbreviated or full), day with optional ordinal suffix,
        # optional comma, four-digit year: "Mar 5 2024", "April 1st, 2024".
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* '
        r'\d{1,2}(?:st|nd|rd|th)?,? \d{4}\b',
    )
    _DATE_RE = re.compile('|'.join(_DATE_PATTERNS))

    # One combined pattern so "Claim #123" is reported once instead of once as
    # "Claim #123" and again as "#123". The word boundary keeps words such as
    # "reclaim" from being read as the keyword "claim".
    _CLAIM_RE = re.compile(
        r'(?:\b(?:claim|case)\s+#?\s*|#\s*)\d+[-\w]*',
        re.IGNORECASE,
    )

    def __init__(self):
        """Load the tokenizer and model named by ``self.model_name``.

        Uses the module-level ``hf_token`` for authentication.

        Raises:
            Exception: Any error raised while loading is printed and re-raised.
        """
        try:
            self.model_name = "microsoft/Phi-3.5-mini-instruct"
            # `token` is the current keyword for Hub authentication;
            # `use_auth_token` is deprecated in transformers.
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                token=hf_token,
                trust_remote_code=True
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                token=hf_token,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            raise

    def extract_dates(self, text: str):
        """Return the date-like substrings of *text*, in order of appearance.

        Recognized forms: ``YYYY-MM-DD`` / ``YYYY/MM/DD``, ``D-M-Y`` / ``D/M/Y``
        with a 2-4 digit year, and ``Month D, YYYY`` (day may carry an ordinal
        suffix). Matches never overlap. Returns an empty list when nothing
        matches.
        """
        return [match.group() for match in self._DATE_RE.finditer(text)]

    def extract_claim_numbers(self, text: str):
        """Return claim/case references found in *text*, in order of appearance.

        A reference is a number (optionally followed by ``-``/word characters)
        introduced by ``claim``, ``case`` (case-insensitive, optional ``#``) or
        by a bare ``#``. Each reference is reported once.
        """
        return [match.group() for match in self._CLAIM_RE.finditer(text)]

    def generate_prompt(self, transcript: str):
        """Build the instruction prompt sent to the model for *transcript*.

        The extracted dates and claim references are listed ahead of the
        transcript ("None" when there are none), followed by the required
        output format.
        """
        dates = self.extract_dates(transcript)
        claims = self.extract_claim_numbers(transcript)
        dates_text = ', '.join(dates) if dates else 'None'
        claims_text = ', '.join(claims) if claims else 'None'
        # NOTE(review): the <s>[INST] ... [/INST]</s> wrapper looks like the
        # Llama/Mistral instruction format; the Phi-3.5 model card appears to
        # document a <|user|>/<|end|>/<|assistant|> chat format instead.
        # Consider tokenizer.apply_chat_template - confirm before changing.
        return f"""<s>[INST] Please analyze this meeting transcript with extreme precision and provide a structured analysis.
Remember to:
1. Only include information explicitly stated
2. Mark unclear information as "UNCLEAR"
3. Preserve exact numbers, dates, and claims
4. Focus on factual content
Identified dates: {dates_text}
Identified claims: {claims_text}
Please analyze:
{transcript}
Provide your analysis in this format:
PARTICIPANTS:
- List participants and their roles
CONTEXT:
- Meeting purpose
- Duration (if mentioned)
KEY POINTS:
- Main topics
- Decisions made
- Important numbers/metrics
ACTION ITEMS:
- Tasks and assignments
- Deadlines
- Responsible parties
FOLLOW UP:
- Next meetings
- Pending items [/INST]</s>"""

    def analyze_transcript(self, transcript: str, max_new_tokens: int = 1000):
        """Generate and return the model's analysis of *transcript*.

        Args:
            transcript: Raw meeting transcript text.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The generated analysis text, or a string starting with
            "Error analyzing transcript:" if anything fails.
        """
        try:
            prompt = self.generate_prompt(transcript)
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
            prompt_length = inputs["input_ids"].shape[-1]
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=0.1,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )
            # The generated sequence starts with the prompt tokens. Slice them
            # off instead of splitting the decoded text on "[/INST]", which
            # would truncate the answer whenever the transcript or the model
            # output itself contains that marker.
            generated_ids = outputs[0][prompt_length:]
            return self.tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        except Exception as e:
            return f"Error analyzing transcript: {str(e)}"
# Analyzer shared by every request. Created on first use so the model is
# loaded once instead of once per call.
_analyzer = None


def process_transcript(transcript: str):
    """Analyze *transcript* and return the result as text.

    Empty, whitespace-only, or missing input is rejected up front so the model
    is never run without a transcript. The analyzer is created lazily on the
    first real request and reused afterwards; if creation fails, the next call
    tries again.

    Returns:
        The analysis text, or a string starting with
        "Error processing transcript:" when the input is empty or the
        analyzer cannot be created or used.
    """
    global _analyzer
    if not transcript or not transcript.strip():
        return "Error processing transcript: transcript is empty"
    try:
        if _analyzer is None:
            _analyzer = TranscriptAnalyzer()
        return _analyzer.analyze_transcript(transcript)
    except Exception as e:
        return f"Error processing transcript: {str(e)}"
# Assemble the Gradio UI: one multi-line input box for the transcript, one
# multi-line output box for the analysis, and two sample transcripts.
_transcript_box = gr.Textbox(
    label="Enter Meeting Transcript",
    placeholder="Paste your meeting transcript here...",
    lines=10,
)
_result_box = gr.Textbox(lines=20, label="Analysis Result")
_sample_transcripts = [
    ["Meeting started on March 15, 2024 at 2:30 PM\nClaim #12345-ABC discussed regarding property damage\nJohn (Project Manager): Let's review the Q1 budget..."],
    ["Sarah (Team Lead): Good morning everyone. Today's meeting is about the new product launch.\nMike (Marketing): We're targeting April 1st, 2024 for the release.\nClaim #789-XYZ needs to be resolved before launch."],
]

iface = gr.Interface(
    fn=process_transcript,
    inputs=[_transcript_box],
    outputs=_result_box,
    title="Meeting Transcript Analyzer",
    description="Analyze meeting transcripts to extract key information, dates, claims, and action items.",
    examples=_sample_transcripts,
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()