import streamlit as st
from transformers import pipeline
import chardet
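
# A minimal caching sketch (an assumption: Streamlit >= 1.18, which provides
# st.cache_resource). Wrapping pipeline construction like this keeps one model
# instance per (task, model) pair across Streamlit reruns instead of reloading
# the model on every widget interaction; the inline pipeline(...) calls further
# down could be routed through this helper.
@st.cache_resource
def load_pipeline(task_name, model_name):
    return pipeline(task_name, model=model_name)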

st.title("Legal Document Analysis")

# Sidebar for uploading the document
st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader("Choose a document", type=["txt"])

# Sidebar for selecting the task
st.sidebar.header("Select Task")
task = st.sidebar.selectbox(
    "Choose the task you want to perform:",
    ("Summarization", "Named Entity Recognition (NER)"),
)

# Sidebar for setting summarization parameters (only shown if summarization is selected)
if task == "Summarization":
    st.sidebar.header("Summarization Parameters")
    max_length = st.sidebar.slider("Max Length", min_value=50, max_value=500, value=150)
    min_length = st.sidebar.slider("Min Length", min_value=10, max_value=100, value=40)
    do_sample = st.sidebar.checkbox("Use Sampling", value=False)

# Function to detect the file encoding
def detect_encoding(file):
    raw_data = file.read(1024)  # Read a small chunk of the file
    result = chardet.detect(raw_data)
    return result['encoding']

# Function to split text into fixed-size character chunks
def chunk_text(text, chunk_size=1000):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
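# e.g. chunk_text("abcdef", chunk_size=4) -> ['abcd', 'ef']; chunks are cut by
# character count, so a boundary can fall mid-word or mid-sentence.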

# Function to classify text as law-related or not using zero-shot classification
def classify_text(text):
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    candidate_labels = ["law-related", "not law-related"]
    # Classify only the first 512 characters to keep the input short
    result = classifier(text[:512], candidate_labels=candidate_labels)
    return result['labels'][0] == "law-related"
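# For reference: the zero-shot pipeline returns its labels sorted by descending
# score, so result['labels'][0] is the top prediction. Illustrative output shape
# (scores made up):
#   {'sequence': '...', 'labels': ['law-related', 'not law-related'], 'scores': [0.93, 0.07]}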

# Main area - display content and perform the selected task
if uploaded_file is not None:
    try:
        # Detect and decode the file content
        encoding = detect_encoding(uploaded_file)
        if encoding is None:
            encoding = 'utf-8'  # Fall back to a default encoding
        uploaded_file.seek(0)  # Reset the file pointer to the beginning
        text = uploaded_file.read().decode(encoding)

        # Classify the text before proceeding with summarization or NER
        if classify_text(text):
            st.write("This document is classified as law-related.")

            # Chunk the text if it is too long
            chunks = chunk_text(text, chunk_size=1000)

            if task == "Summarization":
                summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
                summarized_text = ""
                # Summarize each chunk and combine the results
                for chunk in chunks:
                    # Skip chunks with fewer words than the requested minimum summary length
                    if len(chunk.split()) > min_length:
                        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=do_sample)
                        summarized_text += summary[0]['summary_text'] + " "
                st.subheader("Summary:")
                st.write(summarized_text)

            elif task == "Named Entity Recognition (NER)":
                # aggregation_strategy="simple" merges sub-word tokens into whole
                # entities (it replaces the deprecated grouped_entities=True flag)
                ner = pipeline("ner", aggregation_strategy="simple", model="dslim/bert-base-NER")
                st.subheader("Named Entities:")
                for chunk in chunks:
                    entities = ner(chunk)
                    for entity in entities:
                        st.write(f"{entity['entity_group']} - {entity['word']} (Score: {entity['score']:.2f})")
        else:
            st.warning("The uploaded document does not contain law-related content. Please upload a legal document.")
    except IndexError as e:
        st.error(f"IndexError: {e}. Ensure the text is long enough and the parameters are set correctly.")
    except UnicodeDecodeError as e:
        st.error(f"Encoding error: {e}. Please upload a file with valid encoding.")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
else:
    st.info("Please upload a document to analyze.")
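
# To run locally (assuming this file is saved as app.py):
#   pip install streamlit transformers torch chardet
#   streamlit run app.py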