Update app.py (major changes)
1.0 Added input and output string limitations
2.0 Added skip_special_tokens
3.0 Simplified input format
3.1 For summarization, we're using the "summarize: " prefix, following the task-prefix convention of the T5 family (note that mT5, unlike T5, was pre-trained without supervised task prefixes, so the prefix mostly pays off once the model is fine-tuned).
3.2 For question-answering, we're combining the question and context in the "question: ... context: ..." format that T5-style models expect.
4.0 Improved output quality:
4.1 Beam search (num_beams=4) keeps several candidate sequences in play at each step and usually produces better, more coherent results than greedy decoding.
4.2 Early stopping (early_stopping=True) ends beam search as soon as enough complete candidates are available, so generation doesn't run on longer than necessary. A minimal sketch of these settings follows this list.
5.0 Model Performance Expectations:
5.1 Since mT5-small was not specifically fine-tuned for legal tasks (e.g., summarizing legal documents or answering legal questions), the pre-trained model might struggle with domain-specific terminology.
5.2 If performance is unsatisfactory, you might get better results by fine-tuning the model on a small set of legal texts; a hypothetical sketch of such a run follows the diff below.
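To make points 1.0 through 4.2 concrete in isolation, here is a minimal sketch of the new tokenization and generation settings. The checkpoint name "google/mt5-small" is an assumption (app.py only shows model_name), and the sample text is invented; the settings themselves mirror the diff below.

# Minimal sketch of the settings above; assumes the checkpoint is
# "google/mt5-small" -- adjust model_name to whatever app.py actually uses.
from transformers import AutoTokenizer, T5ForConditionalGeneration

model_name = "google/mt5-small"  # assumption, not confirmed by the diff
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

legal_text = "The lessee shall vacate the premises within thirty days of written notice..."

# 1.0: cap input length at 512 tokens and truncate longer documents
# 3.1: prepend the T5-style "summarize: " task prefix
inputs = tokenizer("summarize: " + legal_text, return_tensors="pt",
                   max_length=512, truncation=True)

# 4.1/4.2: beam search with early stopping; output capped at 150 tokens
outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)

# 2.0: skip_special_tokens drops <pad> and </s> from the decoded string
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Without fine-tuning, this can emit placeholder tokens such as <extra_id_0> instead of a real summary, which is exactly the caveat in point 5.1.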
@@ -9,60 +9,48 @@ model = T5ForConditionalGeneration.from_pretrained(model_name)
 
 def correct_htr(raw_htr_text):
     # Tokenize the input text
-    inputs = tokenizer(raw_htr_text, return_tensors="pt")
+    inputs = tokenizer(raw_htr_text, return_tensors="pt", max_length=512, truncation=True)
+    print("Tokenized Inputs for HTR Correction:", inputs)  # Debugging
 
-    # Generate corrected text
-    outputs = model.generate(**inputs)
+    # Generate corrected text with max_length and beam search
+    outputs = model.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
+    print("Generated Output (Tokens) for HTR Correction:", outputs)  # Debugging
+
+    # Decode the output, skipping special tokens
     corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    print("Decoded Output for HTR Correction:", corrected_text)  # Debugging
 
     return corrected_text
 
 def summarize_text(legal_text):
-    # Tokenize the input text with summarization prompt
-    inputs = tokenizer("summarize: " + legal_text, return_tensors="pt")
+    # Tokenize the input text with the summarization prompt
+    inputs = tokenizer("summarize: " + legal_text, return_tensors="pt", max_length=512, truncation=True)
+    print("Tokenized Inputs for Summarization:", inputs)  # Debugging
+
+    # Generate summary with beam search for better results
+    outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
+    print("Generated Summary (Tokens):", outputs)  # Debugging
 
-    #
-    outputs = model.generate(**inputs)
+    # Decode the output, skipping special tokens
     summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    print("Decoded Summary:", summary)  # Debugging
 
     return summary
 
 def answer_question(legal_text, question):
-    #
-
+    # Format input for question-answering
+    formatted_input = f"question: {question} context: {legal_text}"
+    inputs = tokenizer(formatted_input, return_tensors="pt", max_length=512, truncation=True)
+    print("Tokenized Inputs for Question Answering:", inputs)  # Debugging
 
-    # Generate answer
-    outputs = model.generate(**inputs)
+    # Generate answer using beam search
+    outputs = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
+    print("Generated Answer (Tokens):", outputs)  # Debugging
+
+    # Decode the output, skipping special tokens
     answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    print("Decoded Answer:", answer)  # Debugging
 
     return answer
 
-# Create the Gradio Blocks interface
-with gr.Blocks() as demo:
-    gr.Markdown("# mT5 Legal Assistant")
-    gr.Markdown("Use this tool to correct raw HTR, summarize legal texts, or answer questions about legal cases.")
-
-    with gr.Tab("Correct HTR"):
-        gr.Markdown("### Correct Raw HTR Text")
-        raw_htr_input = gr.Textbox(lines=5, placeholder="Enter raw HTR text here...")
-        corrected_output = gr.Textbox(lines=5, placeholder="Corrected HTR text")
-        correct_button = gr.Button("Correct HTR")
-        correct_button.click(correct_htr, inputs=raw_htr_input, outputs=corrected_output)
-
-    with gr.Tab("Summarize Legal Text"):
-        gr.Markdown("### Summarize Legal Text")
-        legal_text_input = gr.Textbox(lines=10, placeholder="Enter legal text to summarize...")
-        summary_output = gr.Textbox(lines=5, placeholder="Summary of legal text")
-        summarize_button = gr.Button("Summarize Text")
-        summarize_button.click(summarize_text, inputs=legal_text_input, outputs=summary_output)
-
-    with gr.Tab("Answer Legal Question"):
-        gr.Markdown("### Answer a Question Based on Legal Text")
-        legal_text_input_q = gr.Textbox(lines=10, placeholder="Enter legal text...")
-        question_input = gr.Textbox(lines=2, placeholder="Enter your question...")
-        answer_output = gr.Textbox(lines=5, placeholder="Answer to your question")
-        answer_button = gr.Button("Get Answer")
-        answer_button.click(answer_question, inputs=[legal_text_input_q, question_input], outputs=answer_output)
-
-demo.launch()
 
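Point 5.2 only gestures at fine-tuning, so here is a hypothetical sketch of what a minimal run could look like with Hugging Face's Seq2SeqTrainer. The checkpoint, the one-example toy dataset, the output path, and the hyperparameters are all illustrative assumptions, not part of this commit.

# Hypothetical fine-tuning sketch for point 5.2; dataset and
# hyperparameters are placeholders, not part of this commit.
from datasets import Dataset
from transformers import (AutoTokenizer, MT5ForConditionalGeneration,
                          DataCollatorForSeq2Seq, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

# Toy stand-in for a real corpus of legal text/summary pairs.
pairs = Dataset.from_dict({
    "text": ["The lessee shall vacate the premises within thirty days of written notice."],
    "summary": ["Tenant must leave within 30 days of notice."],
})

def preprocess(batch):
    # Same "summarize: " prefix and 512-token input cap as in app.py.
    model_inputs = tokenizer(["summarize: " + t for t in batch["text"]],
                             max_length=512, truncation=True)
    labels = tokenizer(text_target=batch["summary"], max_length=150, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = pairs.map(preprocess, batched=True, remove_columns=pairs.column_names)

args = Seq2SeqTrainingArguments(
    output_dir="mt5-legal-finetuned",  # illustrative path
    per_device_train_batch_size=4,
    learning_rate=3e-4,
    num_train_epochs=3,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
)
trainer.train()

After training, pointing model_name in app.py at the fine-tuned checkpoint directory would be enough for the Space to pick up the new weights.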