File size: 4,281 Bytes
ccd1b2a b6b0279 ccd1b2a 8076129 b6b0279 ccd1b2a 8076129 97ce7f7 8076129 3cd21bd 8076129 ccd1b2a cfc5da5 424939b cfc5da5 85c9be6 cfc5da5 85c9be6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import warnings
import os
import gradio as gr
import torch
import tempfile
import numpy as np
import cohere
import spacy
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from pdfminer.high_level import extract_text
from nltk.tokenize.texttiling import TextTilingTokenizer
# Download necessary NLTK data: the punkt tokenizer models, the stopword
# lists, and the English averaged-perceptron POS tagger.
# NOTE(review): these calls were reconstructed from the resource names that
# survived in the garbled comment -- confirm against the original file.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
# Download spaCy model so that spacy.load("en_core_web_sm") can find it.
spacy.cli.download("en_core_web_sm")
# Cohere client used by generate_response; the API key is read from the
# CO_API_KEY environment variable (os.environ.get yields None when it is unset).
co = cohere.Client(os.environ.get("CO_API_KEY"))
# spaCy English pipeline; analyze_text reads entities and dependencies from it.
nlp = spacy.load("en_core_web_sm")
from transformers import AutoTokenizer, AutoModel
# Load the InLegalBERT tokenizer and encoder used by get_bert_embeddings.
tokenizer = AutoTokenizer.from_pretrained("law-ai/InLegalBERT")
model = AutoModel.from_pretrained("law-ai/InLegalBERT")
# Initialize TextTilingTokenizer with default parameters; it segments the
# document text inside process_pdf_and_generate_response.
tiling_tokenizer = TextTilingTokenizer()
def generate_response(prompt, embeddings):
    """Ask the Cohere model to answer *prompt*, prefixed by an embedding summary.

    Args:
        prompt: Instruction text sent to the model.
        embeddings: Iterable of numeric arrays, one per document chunk. They
            are reduced to a single scalar (the mean of the per-chunk means),
            which is prepended to the prompt as a text line.

    Returns:
        The stripped text of the first generation, or an
        ``"An error occurred: ..."`` message when the Cohere call raises
        ``cohere.error.CohereError``.
    """
    chunk_means = [np.mean(embed) for embed in embeddings]
    # np.mean of an empty list emits a RuntimeWarning and yields NaN; fall
    # back to 0.0 so the summary line stays well-formed without embeddings.
    aggregated_embedding = np.mean(chunk_means) if chunk_means else 0.0
    embedding_str = f"Embedding summary: {aggregated_embedding:.2f}"
    full_prompt = f"{embedding_str}\n\n{prompt}"
    try:
        # NOTE(review): part of the original argument list was lost; only
        # max_tokens survived. The model and sampling settings therefore use
        # the client defaults here -- confirm against the original file.
        response = co.generate(
            prompt=full_prompt,
            max_tokens=750,  # Increase the max tokens for a longer response
        )
        return response.generations[0].text.strip()
    except cohere.error.CohereError as e:
        return f"An error occurred: {str(e)}"
def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` reads from *pdf_path*."""
    pdf_text = extract_text(pdf_path)
    return pdf_text
def get_bert_embeddings(texts):
    """Embed each text with the module-level tokenizer and model.

    Args:
        texts: Iterable of strings. Each one is tokenized on its own and
            truncated to at most 512 tokens.

    Returns:
        A list with one NumPy array per input text, in input order. An empty
        input yields an empty list.
    """
    embeddings_list = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        # Inference only: skip gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs)
        # Keep the hidden state at sequence position 0 (the [CLS] position
        # for BERT-style tokenizers) as the text's embedding.
        embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        # Collect the vector; without this the function always returned [].
        embeddings_list.append(embeddings)
    return embeddings_list
def analyze_text(text):
    """Extract named entities, POS tags and dependency relations from *text*.

    Returns:
        A 3-tuple ``(entities, pos_tags, dependencies)`` where

        * ``entities`` holds ``(entity text, label)`` pairs from the spaCy doc,
        * ``pos_tags`` is the NLTK ``pos_tag`` result over ``word_tokenize(text)``,
        * ``dependencies`` holds ``(token text, dependency label, head text)``
          triples, one per spaCy token.
    """
    parsed = nlp(text)

    entities = []
    for span in parsed.ents:
        entities.append((span.text, span.label_))

    # POS tags come from NLTK, not from the spaCy parse.
    pos_tags = pos_tag(word_tokenize(text))

    dependencies = []
    for tok in parsed:
        dependencies.append((tok.text, tok.dep_, tok.head.text))

    return entities, pos_tags, dependencies
def process_pdf_and_generate_response(pdf_file, query):
    """Answer *query* about an uploaded PDF using the LLM.

    The upload is copied to a temporary ``.pdf`` file, its text is extracted
    and analysed (entities, POS tags and dependencies are printed), the text
    is segmented and embedded, and a prompt built from the first 2000
    characters of the document plus the query is sent to ``generate_response``.

    Args:
        pdf_file: Path of the uploaded PDF, opened with ``open(pdf_file, 'rb')``.
            ``None`` (nothing uploaded) is reported back as a message.
        query: The user's question about the document.

    Returns:
        The generated answer, or a short message asking for an upload when
        *pdf_file* is ``None``.
    """
    if pdf_file is None:
        return "Please upload a PDF document."

    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file_path = temp_file.name
            with open(pdf_file, 'rb') as f:
                temp_file.write(f.read())
        # The temp file is closed (and flushed) before it is read back.
        document_text = extract_text_from_pdf(temp_file_path)
    finally:
        # delete=False leaves the file on disk, so remove it explicitly.
        if temp_file_path is not None:
            os.remove(temp_file_path)

    entities, pos_tags, dependencies = analyze_text(document_text)
    print("Entities:", entities)
    print("POS Tags:", pos_tags)
    print("Dependencies:", dependencies)

    # Segment the document text using TextTiling
    try:
        text_chunks = tiling_tokenizer.tokenize(document_text)
    except ValueError:
        # TextTiling rejects text that is too short or has no paragraph
        # breaks; treat the whole document as a single chunk in that case.
        text_chunks = [document_text] if document_text.strip() else []

    # Process document text with InLegalBERT
    document_embeddings = get_bert_embeddings(text_chunks)

    # Construct prompt for LLM
    prompt = f"You are an AI driven research engine for commercial courts, Given the legal document: '{document_text[:2000]}', answer the query : '{query}'"

    # Generate response using LLM
    response = generate_response(prompt, document_embeddings)
    return response
def chunk_long_sentence(sentence, max_chars=512):
    """Split *sentence* into space-joined word chunks of at most *max_chars*.

    Words are never split: a single word longer than *max_chars* becomes a
    chunk of its own (and therefore exceeds the limit).

    Args:
        sentence: Text to split; it is tokenized on whitespace.
        max_chars: Maximum length, in characters, of each joined chunk.

    Returns:
        A list of chunk strings in original word order. Empty or
        whitespace-only input yields an empty list.
    """
    words = sentence.split()
    chunks = []
    current_chunk = []
    current_len = 0  # length of ' '.join(current_chunk), tracked incrementally
    for word in words:
        # Adding a word costs its length plus one separator, except for the
        # first word in a chunk.
        candidate_len = current_len + len(word) + (1 if current_chunk else 0)
        if candidate_len <= max_chars:
            current_chunk.append(word)
            current_len = candidate_len
        else:
            # Flush only a non-empty chunk so an oversized first word does
            # not produce an empty string entry.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_len = len(word)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks
# Gradio UI: a PDF upload and a free-text query, answered by
# process_pdf_and_generate_response.
# NOTE(review): the fn/inputs/outputs arguments and the launch call were
# missing from this block and have been reconstructed -- confirm the output
# component against the original file.
iface = gr.Interface(
    fn=process_pdf_and_generate_response,
    inputs=[
        gr.File(label="Upload PDF Document"),
        gr.Textbox(label="Query", placeholder="Enter your query here..."),
    ],
    outputs="text",
    title="AI-Driven Research Engine for Commercial Courts",
    description="Upload a PDF document and ask a question to get a response generated based on the content of the document.",
)
# Launch the interface
iface.launch()