import streamlit as st
from transformers import pipeline
import chardet
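
# A minimal caching sketch (an assumption: Streamlit >= 1.18, which provides
# st.cache_resource). Wrapping pipeline construction like this keeps one model
# instance per (task, model) pair across Streamlit reruns instead of reloading
# the model on every widget interaction; the inline pipeline(...) calls further
# down could be routed through this helper.
@st.cache_resource
def load_pipeline(task_name, model_name):
    return pipeline(task_name, model=model_name)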

st.title("Legal Document Analysis")

# Sidebar for uploading the document
st.sidebar.header("Upload Document")
uploaded_file = st.sidebar.file_uploader("Choose a document", type=["txt"])

# Sidebar for selecting the task
st.sidebar.header("Select Task")
task = st.sidebar.selectbox(
    "Choose the task you want to perform:",
    ("Summarization", "Named Entity Recognition (NER)"),
)

# Sidebar for setting summarization parameters (only shown if summarization is selected)
if task == "Summarization":
    st.sidebar.header("Summarization Parameters")
    max_length = st.sidebar.slider("Max Length", min_value=50, max_value=500, value=150)
    min_length = st.sidebar.slider("Min Length", min_value=10, max_value=100, value=40)
    do_sample = st.sidebar.checkbox("Use Sampling", value=False)

# Function to detect the file encoding
def detect_encoding(file):
    raw_data = file.read(1024)  # Read a small chunk of the file
    result = chardet.detect(raw_data)
    return result['encoding']

# Function to split text into fixed-size character chunks
def chunk_text(text, chunk_size=1000):
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
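# e.g. chunk_text("abcdef", chunk_size=4) -> ['abcd', 'ef']; chunks are cut by
# character count, so a boundary can fall mid-word or mid-sentence.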

# Function to classify text as law-related or not using zero-shot classification
def classify_text(text):
    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
    candidate_labels = ["law-related", "not law-related"]
    # Classify only the first 512 characters to keep the input short
    result = classifier(text[:512], candidate_labels=candidate_labels)
    return result['labels'][0] == "law-related"
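# For reference: the zero-shot pipeline returns its labels sorted by descending
# score, so result['labels'][0] is the top prediction. Illustrative output shape
# (scores made up):
#   {'sequence': '...', 'labels': ['law-related', 'not law-related'], 'scores': [0.93, 0.07]}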

# Main area - display content and perform the selected task
if uploaded_file is not None:
    try:
        # Detect and decode the file content
        encoding = detect_encoding(uploaded_file)
        if encoding is None:
            encoding = 'utf-8'  # Fall back to a default encoding
        uploaded_file.seek(0)  # Reset the file pointer to the beginning
        text = uploaded_file.read().decode(encoding)

        # Classify the text before proceeding with summarization or NER
        if classify_text(text):
            st.write("This document is classified as law-related.")

            # Chunk the text if it is too long
            chunks = chunk_text(text, chunk_size=1000)

            if task == "Summarization":
                summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
                summarized_text = ""
                # Summarize each chunk and combine the results
                for chunk in chunks:
                    # Skip chunks with fewer words than the requested minimum summary length
                    if len(chunk.split()) > min_length:
                        summary = summarizer(chunk, max_length=max_length, min_length=min_length, do_sample=do_sample)
                        summarized_text += summary[0]['summary_text'] + " "
                st.subheader("Summary:")
                st.write(summarized_text)

            elif task == "Named Entity Recognition (NER)":
                # aggregation_strategy="simple" merges sub-word tokens into whole
                # entities (it replaces the deprecated grouped_entities=True flag)
                ner = pipeline("ner", aggregation_strategy="simple", model="dslim/bert-base-NER")
                st.subheader("Named Entities:")
                for chunk in chunks:
                    entities = ner(chunk)
                    for entity in entities:
                        st.write(f"{entity['entity_group']} - {entity['word']} (Score: {entity['score']:.2f})")
        else:
            st.warning("The uploaded document does not contain law-related content. Please upload a legal document.")
    except IndexError as e:
        st.error(f"IndexError: {e}. Ensure the text is long enough and the parameters are set correctly.")
    except UnicodeDecodeError as e:
        st.error(f"Encoding error: {e}. Please upload a file with valid encoding.")
    except Exception as e:
        st.error(f"An unexpected error occurred: {e}")
else:
    st.info("Please upload a document to analyze.")
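
# To run locally (assuming this file is saved as app.py):
#   pip install streamlit transformers torch chardet
#   streamlit run app.py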