# app.py
"""Gradio app that analyzes meeting transcripts with Mistral-7B-Instruct.

Dates and claim/case references are extracted from the transcript with
regular expressions and passed to the model as hints, together with the
transcript itself and a fixed output format.
"""
import json
import re
from datetime import datetime
from functools import lru_cache
from typing import Iterable, List, Pattern

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Compiled once at import time; the extractors run on every request.
# The digit lookarounds stop the numeric patterns from matching a fragment of
# a longer number (e.g. "24-03-15" inside "2024-03-15").
_DATE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'(?<!\d)\d{1,2}[-/]\d{1,2}[-/]\d{2,4}(?!\d)',
        r'(?<!\d)\d{4}[-/]\d{1,2}[-/]\d{1,2}(?!\d)',
        r'\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* '
        r'\d{1,2}(?:st|nd|rd|th)?,? \d{4}\b',
    )
)

# The word boundaries keep "reclaim 5" / "suitcase 3" from being reported.
_CLAIM_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\bclaim\s+#?\s*\d+[-\w]*',
        r'\bcase\s+#?\s*\d+[-\w]*',
        r'#\s*\d+[-\w]*',
    )
)

_PROMPT_TEMPLATE = """[INST] Please analyze this meeting transcript with extreme precision and provide a structured analysis.
Remember to:
1. Only include information explicitly stated
2. Mark unclear information as "UNCLEAR"
3. Preserve exact numbers, dates, and claims
4. Focus on factual content

Identified dates: {dates}
Identified claims: {claims}

Please analyze:
{transcript}

Provide your analysis in this format:
PARTICIPANTS:
- List participants and their roles

CONTEXT:
- Meeting purpose
- Duration (if mentioned)

KEY POINTS:
- Main topics
- Decisions made
- Important numbers/metrics

ACTION ITEMS:
- Tasks and assignments
- Deadlines
- Responsible parties

FOLLOW UP:
- Next meetings
- Pending items
[/INST]"""


def _collect_matches(patterns: Iterable[Pattern[str]], text: str) -> List[str]:
    """Return the unique matches of *patterns* in *text*, in order of appearance.

    A match that lies entirely inside another match is dropped, so a broader
    pattern ("Claim #12-AB") wins over a narrower one ("#12-AB") that hits the
    same stretch of text.
    """
    if not text:
        return []

    spans = sorted(
        {(m.start(), m.end()) for pattern in patterns for m in pattern.finditer(text)},
        # Earliest start first; at the same start, the longest span first.
        key=lambda span: (span[0], -span[1]),
    )

    found = []
    furthest_end = 0
    for start, end in spans:
        # Starts are non-decreasing, so a span ending at or before the
        # furthest end seen so far is contained in an already kept span.
        if end <= furthest_end:
            continue
        furthest_end = end
        found.append(text[start:end])

    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(found))


class TranscriptAnalyzer:
    """Loads the instruction-tuned model and produces transcript analyses."""

    def __init__(self):
        # Initialize the model and tokenizer
        self.model_name = "mistralai/Mistral-7B-Instruct-v0.2"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )

    def extract_dates(self, text: str) -> List[str]:
        """Return the distinct date strings found in *text*.

        Recognizes numeric dates separated by "-" or "/" (day/month first or
        year first) and English month-name dates such as "March 15, 2024" or
        "April 1st, 2024". Results are in order of appearance.
        """
        return _collect_matches(_DATE_PATTERNS, text)

    def extract_claim_numbers(self, text: str) -> List[str]:
        """Return the distinct claim/case references found in *text*.

        Recognizes "claim <n>", "case <n>" (with an optional "#") and bare
        "#<n>" references, case-insensitively. Each reference is reported once,
        in its longest form, in order of appearance.
        """
        return _collect_matches(_CLAIM_PATTERNS, text)

    def generate_prompt(self, transcript: str) -> str:
        """Build the [INST]-wrapped prompt for *transcript*.

        The extracted dates and claim references are listed ahead of the
        transcript ("None" when nothing was found).
        """
        dates = self.extract_dates(transcript)
        claims = self.extract_claim_numbers(transcript)
        return _PROMPT_TEMPLATE.format(
            dates=', '.join(dates) if dates else 'None',
            claims=', '.join(claims) if claims else 'None',
            transcript=transcript,
        )

    def analyze_transcript(self, transcript: str) -> str:
        """Run the model on *transcript* and return its analysis text."""
        # Generate prompt
        prompt = self.generate_prompt(transcript)

        # Tokenize input
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Generate response
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=1000,
                temperature=0.1,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # generate() returns prompt tokens followed by the completion. Decode
        # only the completion rather than splitting the full text on "[/INST]",
        # which would cut the answer short if the model emitted that marker.
        prompt_length = inputs["input_ids"].shape[1]
        completion = outputs[0][prompt_length:]
        return self.tokenizer.decode(completion, skip_special_tokens=True).strip()


@lru_cache(maxsize=1)
def _get_analyzer() -> TranscriptAnalyzer:
    """Return the shared analyzer, loading the model on first use only."""
    return TranscriptAnalyzer()


def process_transcript(transcript: str):
    """Gradio handler: return the model's analysis of *transcript*.

    Blank input is answered with a short message without touching the model.
    """
    if not transcript or not transcript.strip():
        return "Please provide a meeting transcript to analyze."
    return _get_analyzer().analyze_transcript(transcript)


# Create Gradio interface
iface = gr.Interface(
    fn=process_transcript,
    inputs=[
        gr.Textbox(
            lines=10,
            label="Enter Meeting Transcript",
            placeholder="Paste your meeting transcript here...",
        )
    ],
    outputs=gr.Textbox(
        label="Analysis Result",
        lines=20,
    ),
    title="Meeting Transcript Analyzer",
    description="Analyze meeting transcripts to extract key information, dates, claims, and action items.",
    examples=[
        ["Meeting started on March 15, 2024 at 2:30 PM\nClaim #12345-ABC discussed regarding property damage\nJohn (Project Manager): Let's review the Q1 budget..."],
        ["Sarah (Team Lead): Good morning everyone. Today's meeting is about the new product launch.\nMike (Marketing): We're targeting April 1st, 2024 for the release.\nClaim #789-XYZ needs to be resolved before launch."],
    ],
)

# Launch the app
if __name__ == "__main__":
    iface.launch()