Spaces:

sainathBelagavi
/

transcript_summary

Sleeping

File size: 5,378 Bytes

# app.py
import gradio as gr
import json
import re
import os
from datetime import datetime
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login

# Log in to the Hugging Face Hub with the token stored in the
# HUGGINGFACE_TOKEN environment variable. `hf_token` stays at module level
# because TranscriptAnalyzer passes it on when loading the model.
# Any failure (missing token or rejected login) is printed and re-raised.

try:
    hf_token = os.environ.get('HUGGINGFACE_TOKEN')
    if not hf_token:
        raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
    login(token=hf_token)
except Exception as exc:
    print(f"Error during Hugging Face login: {str(exc)}")
    raise

class TranscriptAnalyzer:
    """Analyze meeting transcripts with a causal language model.

    Dates and claim/case numbers are pre-extracted with regular expressions,
    embedded in an instruction prompt together with the transcript, and the
    model is asked to produce a structured analysis.
    """

    # One combined pattern, scanned in a single pass so matches come back in
    # text order and cannot overlap. The digit lookarounds stop the
    # day-first alternative from matching the tail of a year-first date
    # (e.g. "24-03-15" inside "2024-03-15").
    _DATE_RE = re.compile(
        r'(?<!\d)\d{4}[-/]\d{1,2}[-/]\d{1,2}(?!\d)'
        r'|(?<!\d)\d{1,2}[-/]\d{1,2}[-/]\d{2,4}(?!\d)'
        r'|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*'
        r' \d{1,2}(?:st|nd|rd|th)?,? \d{4}\b'
    )

    # "claim 123", "case #123-AB" or a bare "#123". Using one alternation
    # means "Claim #123" is reported once instead of also as "#123".
    _CLAIM_RE = re.compile(
        r'(?:\b(?:claim|case)\s+#?|#)\s*\d+[-\w]*',
        re.IGNORECASE,
    )

    def __init__(self):
        """Load the tokenizer and model named by ``self.model_name``.

        Reads the module-level ``hf_token`` for authentication.

        Raises:
            Exception: whatever the loading calls raise; the error is
                printed first and then re-raised unchanged.
        """
        try:
            # Initialize the model and tokenizer with auth token
            self.model_name = "microsoft/Phi-3.5-mini-instruct"
            # `token` is the current keyword for the access token;
            # `use_auth_token` is deprecated in recent transformers releases.
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                token=hf_token,
                trust_remote_code=True
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                token=hf_token,
                torch_dtype=torch.float16,
                device_map="auto",
                trust_remote_code=True
            )
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            raise

    def extract_dates(self, text: str) -> list:
        """Return every date-like substring of *text*, in order of appearance.

        Recognized forms: ``YYYY-MM-DD`` / ``YYYY/MM/DD``, ``D-M-YY[YY]`` /
        ``D/M/YY[YY]``, and month-name dates such as ``March 15, 2024`` or
        ``April 1st, 2024``. Values are not validated as calendar dates.

        Args:
            text: The text to scan.

        Returns:
            A list of matched strings (empty when nothing matches).
            Repeated dates are kept as often as they occur.
        """
        return [match.group() for match in self._DATE_RE.finditer(text)]

    def extract_claim_numbers(self, text: str) -> list:
        """Return claim/case references found in *text*, in order of appearance.

        Matches ``claim``/``case`` (case-insensitive, whole word) followed by
        an optional ``#`` and a number, or a bare ``#`` followed by a number.
        The number may continue with letters, digits, ``_`` or ``-``.

        Args:
            text: The text to scan.

        Returns:
            A list of matched strings (empty when nothing matches).
        """
        return [match.group() for match in self._CLAIM_RE.finditer(text)]

    def generate_prompt(self, transcript: str):
        """Build the instruction prompt sent to the model for *transcript*.

        The prompt lists the dates and claim numbers found by the extractors
        (or ``None`` when there are none), then the transcript itself, then
        the required output format.

        Args:
            transcript: Raw meeting transcript text.

        Returns:
            The complete prompt string.
        """
        dates = self.extract_dates(transcript)
        claims = self.extract_claim_numbers(transcript)
        date_summary = ', '.join(dates) if dates else 'None'
        claim_summary = ', '.join(claims) if claims else 'None'

        # NOTE(review): the "<s>[INST] ... [/INST]</s>" wrapper looks like the
        # Llama/Mistral instruction format; confirm it matches the chat
        # template expected by the model configured in __init__.
        return f"""<s>[INST] Please analyze this meeting transcript with extreme precision and provide a structured analysis.
Remember to:
1. Only include information explicitly stated
2. Mark unclear information as "UNCLEAR"
3. Preserve exact numbers, dates, and claims
4. Focus on factual content

Identified dates: {date_summary}
Identified claims: {claim_summary}

Please analyze:
{transcript}

Provide your analysis in this format:
PARTICIPANTS:
- List participants and their roles

CONTEXT:
- Meeting purpose
- Duration (if mentioned)

KEY POINTS:
- Main topics
- Decisions made
- Important numbers/metrics

ACTION ITEMS:
- Tasks and assignments
- Deadlines
- Responsible parties

FOLLOW UP:
- Next meetings
- Pending items [/INST]</s>"""

    def analyze_transcript(self, transcript: str):
        """Run the model on *transcript* and return its analysis text.

        Args:
            transcript: Raw meeting transcript text.

        Returns:
            The model's generated text with surrounding whitespace removed,
            or a string starting with ``"Error analyzing transcript:"`` if
            anything fails (this method never raises).
        """
        try:
            # Generate prompt
            prompt = self.generate_prompt(transcript)

            # Tokenize input
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
            prompt_length = inputs["input_ids"].shape[-1]

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=1000,
                    temperature=0.1,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Decode only the tokens produced after the prompt. This avoids
            # searching the decoded text for "[/INST]", which would cut the
            # answer short if the model itself emitted that marker.
            generated_tokens = outputs[0][prompt_length:]
            response = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)

            return response.strip()
        except Exception as e:
            return f"Error analyzing transcript: {str(e)}"

# Shared analyzer instance, created on first use so the tokenizer and model
# are loaded once per process instead of once per request.
_analyzer = None


def _get_analyzer():
    """Return the shared TranscriptAnalyzer, constructing it on first call."""
    global _analyzer
    if _analyzer is None:
        # Assigned only after construction succeeds, so a failed load is
        # retried on the next request rather than cached.
        _analyzer = TranscriptAnalyzer()
    return _analyzer


def process_transcript(transcript: str):
    """Analyze *transcript* and return the result as display text.

    This is the callback wired into the Gradio interface, so it never
    raises: every failure is converted into an error string.

    Args:
        transcript: Raw meeting transcript text entered by the user.

    Returns:
        The analysis text, or a string starting with
        ``"Error processing transcript:"`` when the input is empty or
        whitespace-only, or when the analyzer cannot be created or run.
    """
    # Reject empty input up front; there is nothing to analyze and no
    # reason to load or run the model for it.
    if not transcript or not transcript.strip():
        return "Error processing transcript: transcript is empty"
    try:
        return _get_analyzer().analyze_transcript(transcript)
    except Exception as e:
        return f"Error processing transcript: {str(e)}"

# Gradio UI: a multi-line textbox for the transcript and a multi-line
# textbox for the analysis, wired to process_transcript.
_transcript_input = gr.Textbox(
    label="Enter Meeting Transcript",
    placeholder="Paste your meeting transcript here...",
    lines=10,
)
_analysis_output = gr.Textbox(label="Analysis Result", lines=20)

# Sample transcripts offered in the UI; each entry supplies one value for
# the single input textbox.
_example_transcripts = [
    ["Meeting started on March 15, 2024 at 2:30 PM\nClaim #12345-ABC discussed regarding property damage\nJohn (Project Manager): Let's review the Q1 budget..."],
    ["Sarah (Team Lead): Good morning everyone. Today's meeting is about the new product launch.\nMike (Marketing): We're targeting April 1st, 2024 for the release.\nClaim #789-XYZ needs to be resolved before launch."],
]

iface = gr.Interface(
    fn=process_transcript,
    inputs=[_transcript_input],
    outputs=_analysis_output,
    title="Meeting Transcript Analyzer",
    description="Analyze meeting transcripts to extract key information, dates, claims, and action items.",
    examples=_example_transcripts,
)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()