import warnings
warnings.filterwarnings('ignore')

import os
import tempfile

import gradio as gr
import numpy as np
import torch
import cohere
import spacy
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize.texttiling import TextTilingTokenizer
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModel

# Download the NLTK data needed for tokenization, POS tagging, and stopwords
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

# Download and load the spaCy English pipeline used for NER and dependency parsing
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

# Cohere client for response generation; reads the key from the CO_API_KEY env var
co = cohere.Client(os.environ.get("CO_API_KEY"))

# Load InLegalBERT, a BERT model pre-trained on Indian legal text, for chunk embeddings
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")

# TextTiling (default parameters) segments a document into topically coherent
# chunks; it requires blank-line paragraph breaks in the input text
tiling_tokenizer = TextTilingTokenizer()

def generate_response(prompt, embeddings):
    # Collapse the per-chunk embeddings into a single scalar; this is only a
    # coarse numeric summary that gets prepended to the prompt text
    aggregated_embedding = np.mean([np.mean(embed) for embed in embeddings])
    embedding_str = f"Embedding summary: {aggregated_embedding:.2f}"

    full_prompt = f"{embedding_str}\n\n{prompt}"

    try:
        response = co.generate(
            model="command-xlarge-nightly",
            prompt=full_prompt,
            max_tokens=750  # allow a longer response
        )
        return response.generations[0].text.strip()
    except cohere.error.CohereError as e:
        return f"An error occurred: {str(e)}"

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

def get_bert_embeddings(texts):
    embeddings_list = []

    for text in texts:
        # Truncate each chunk to InLegalBERT's 512-token limit
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Use the [CLS] token's final hidden state as the chunk embedding
        embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings_list.append(embeddings)

    return embeddings_list
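# Usage sketch: each chunk yields the 768-dimensional [CLS] vector from
# InLegalBERT (a BERT-base model), e.g.
#
#     >>> vecs = get_bert_embeddings(["The lease terminates on 31 March 2025."])
#     >>> vecs[0].shape
#     (768,)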

def analyze_text(text):
    # spaCy provides named entities and dependency triples; NLTK provides POS tags
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    return entities, pos_tags, dependencies
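# Usage sketch (illustrative output; exact labels depend on the models):
#
#     >>> ents, tags, deps = analyze_text("ACME Corp sued Beta Ltd in Delhi.")
#     >>> ents
#     [('ACME Corp', 'ORG'), ('Beta Ltd', 'ORG'), ('Delhi', 'GPE')]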

def process_pdf_and_generate_response(pdf_file, query):
    # Copy the uploaded file to a temporary .pdf path so pdfminer can read it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        with open(pdf_file, 'rb') as f:
            temp_file.write(f.read())
        temp_file_path = temp_file.name

    document_text = extract_text_from_pdf(temp_file_path)

    # The linguistic analysis is logged for inspection but not fed to the LLM
    entities, pos_tags, dependencies = analyze_text(document_text)
    print("Entities:", entities)
    print("POS Tags:", pos_tags)
    print("Dependencies:", dependencies)

    # Segment the document into topical chunks with TextTiling; fall back to a
    # single chunk if the extracted text has no blank-line paragraph breaks
    try:
        text_chunks = tiling_tokenizer.tokenize(document_text)
    except ValueError:
        text_chunks = [document_text]

    # Embed each chunk with InLegalBERT
    document_embeddings = get_bert_embeddings(text_chunks)

    # Construct the prompt for the LLM from the first 2,000 characters
    prompt = (
        "You are an AI-driven research engine for commercial courts. "
        f"Given the legal document: '{document_text[:2000]}', "
        f"answer the query: '{query}'"
    )

    # Generate the response using the LLM
    return generate_response(prompt, document_embeddings)
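# Usage sketch (hypothetical file path): the pipeline can be exercised without
# the Gradio UI, assuming the PDF exists and CO_API_KEY is set.
#
#     >>> answer = process_pdf_and_generate_response("sample_contract.pdf",
#     ...                                            "What is the notice period?")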

def chunk_long_sentence(sentence):
    # Helper (not currently called by the pipeline above): greedily pack words
    # into chunks of at most 512 characters, never splitting a word
    words = sentence.split()
    chunks = []
    current_chunk = []

    for word in words:
        if len(' '.join(current_chunk + [word])) <= 512:
            current_chunk.append(word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
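# Usage sketch: chunks are capped at 512 characters (characters, not BERT
# tokens), so a 200-word input splits into three pieces here.
#
#     >>> parts = chunk_long_sentence("clause " * 200)
#     >>> all(len(p) <= 512 for p in parts)
#     True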
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=[
        gr.File(label="Upload PDF Document"), 
        gr.Textbox(label="Query", placeholder="Enter your query here...")
    ],
    outputs=gr.Textbox(label="Response"),
    title="AI-Driven Research Engine for Commercial Courts",
    description="Upload a PDF document and ask a question to get a response generated based on the content of the document."
)

# Launch the Gradio interface
iface.launch(share=True)