import warnings
warnings.filterwarnings('ignore')

import os
import tempfile

import gradio as gr
import numpy as np
import torch
import cohere
import spacy
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize.texttiling import TextTilingTokenizer
from pdfminer.high_level import extract_text
from transformers import AutoTokenizer, AutoModel

# Download the NLTK data needed for tokenization, POS tagging, and stopwords
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')

# Download and load the spaCy English pipeline used for NER and dependency parsing
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

# Cohere client for response generation; reads the key from the CO_API_KEY env var
co = cohere.Client(os.environ.get("CO_API_KEY"))

# Load InLegalBERT, a BERT model pre-trained on Indian legal text, for chunk embeddings
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")

# TextTiling (default parameters) segments a document into topically coherent
# chunks; it requires blank-line paragraph breaks in the input text
tiling_tokenizer = TextTilingTokenizer()

def generate_response(prompt, embeddings):
    # Collapse the per-chunk embeddings into a single scalar; this is only a
    # coarse numeric summary that gets prepended to the prompt text
    aggregated_embedding = np.mean([np.mean(embed) for embed in embeddings])
    embedding_str = f"Embedding summary: {aggregated_embedding:.2f}"

    full_prompt = f"{embedding_str}\n\n{prompt}"

    try:
        response = co.generate(
            model="command-xlarge-nightly",
            prompt=full_prompt,
            max_tokens=750  # allow a longer response
        )
        return response.generations[0].text.strip()
    except cohere.error.CohereError as e:
        return f"An error occurred: {str(e)}"

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

def get_bert_embeddings(texts):
    embeddings_list = []

    for text in texts:
        # Truncate each chunk to InLegalBERT's 512-token limit
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Use the [CLS] token's final hidden state as the chunk embedding
        embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings_list.append(embeddings)

    return embeddings_list
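# Usage sketch: each chunk yields the 768-dimensional [CLS] vector from
# InLegalBERT (a BERT-base model), e.g.
#
#     >>> vecs = get_bert_embeddings(["The lease terminates on 31 March 2025."])
#     >>> vecs[0].shape
#     (768,)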

def analyze_text(text):
    # spaCy provides named entities and dependency triples; NLTK provides POS tags
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    return entities, pos_tags, dependencies
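# Usage sketch (illustrative output; exact labels depend on the models):
#
#     >>> ents, tags, deps = analyze_text("ACME Corp sued Beta Ltd in Delhi.")
#     >>> ents
#     [('ACME Corp', 'ORG'), ('Beta Ltd', 'ORG'), ('Delhi', 'GPE')]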

def process_pdf_and_generate_response(pdf_file, query):
    # Copy the uploaded file to a temporary .pdf path so pdfminer can read it
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        with open(pdf_file, 'rb') as f:
            temp_file.write(f.read())
        temp_file_path = temp_file.name

    document_text = extract_text_from_pdf(temp_file_path)

    # The linguistic analysis is logged for inspection but not fed to the LLM
    entities, pos_tags, dependencies = analyze_text(document_text)
    print("Entities:", entities)
    print("POS Tags:", pos_tags)
    print("Dependencies:", dependencies)

    # Segment the document into topical chunks with TextTiling; fall back to a
    # single chunk if the extracted text has no blank-line paragraph breaks
    try:
        text_chunks = tiling_tokenizer.tokenize(document_text)
    except ValueError:
        text_chunks = [document_text]

    # Embed each chunk with InLegalBERT
    document_embeddings = get_bert_embeddings(text_chunks)

    # Construct the prompt for the LLM from the first 2,000 characters
    prompt = (
        "You are an AI-driven research engine for commercial courts. "
        f"Given the legal document: '{document_text[:2000]}', "
        f"answer the query: '{query}'"
    )

    # Generate the response using the LLM
    return generate_response(prompt, document_embeddings)
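# Usage sketch (hypothetical file path): the pipeline can be exercised without
# the Gradio UI, assuming the PDF exists and CO_API_KEY is set.
#
#     >>> answer = process_pdf_and_generate_response("sample_contract.pdf",
#     ...                                            "What is the notice period?")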

def chunk_long_sentence(sentence):
    # Helper (not currently called by the pipeline above): greedily pack words
    # into chunks of at most 512 characters, never splitting a word
    words = sentence.split()
    chunks = []
    current_chunk = []

    for word in words:
        if len(' '.join(current_chunk + [word])) <= 512:
            current_chunk.append(word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
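# Usage sketch: chunks are capped at 512 characters (characters, not BERT
# tokens), so a 200-word input splits into three pieces here.
#
#     >>> parts = chunk_long_sentence("clause " * 200)
#     >>> all(len(p) <= 512 for p in parts)
#     True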
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=[
        gr.File(label="Upload PDF Document"), 
        gr.Textbox(label="Query", placeholder="Enter your query here...")
    ],
    outputs=gr.Textbox(label="Response"),
    title="AI-Driven Research Engine for Commercial Courts",
    description="Upload a PDF document and ask a question to get a response generated based on the content of the document."
)

# Launch the Gradio interface
iface.launch(share=True)