# Source: model-pick / langchain_pipeline.py (uploaded by anmolsahai, commit 1dd0ce9 "word doc", 8.39 kB)
import os
import logging
from pdf2docx import Converter
from io import BytesIO
import fitz # PyMuPDF
import re
from tempfile import NamedTemporaryFile
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI
from docx import Document
from docx.shared import Pt, RGBColor
from pdfminer import high_level
# Logging: configure the root logger at DEBUG level with timestamp, logger
# name and level in every record, then create this module's logger.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)

# Credentials: indexing os.environ raises KeyError at import time when a
# variable is missing.
# NOTE(review): these constants are not passed to the clients below;
# presumably each client reads its key from the environment -- confirm.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
ANTHROPIC_API_KEY = os.environ["ANTHROPIC_API_KEY"]
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

embedding = OpenAIEmbeddings(model="text-embedding-ada-002")

# Registry of chat models, keyed by the name callers pass to ``pipeline``.
models = {}
models["claude-3"] = ChatAnthropic(model='claude-3-sonnet-20240229')
models["gemini-pro"] = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", temperature=0.0)
def model_names():
    """Return the keys view of the module-level ``models`` registry."""
    registry_keys = models.keys()
    return registry_keys
def sanitize_text(text):
    """Return *text* with C0/C1 control characters and DEL removed.

    Every character in the ranges U+0000-U+001F and U+007F-U+009F is
    dropped (this includes tab, newline and carriage return); all other
    characters are kept unchanged.
    """
    control_chars = re.compile(r'[\x00-\x1F\x7F-\x9F]')
    cleaned = control_chars.sub('', text)
    return cleaned
def extract_text_with_formatting(file_path):
    """Collect every text span of a PDF together with its font attributes.

    The file is opened with PyMuPDF and each page's ``"dict"`` text
    extraction is walked block -> line -> span. Blocks that have no
    ``"lines"`` key are skipped.

    Args:
        file_path: Value handed to ``fitz.open``.

    Returns:
        A list of dicts in reading order of the extraction, each with the
        keys ``"text"``, ``"font_size"`` (the span's ``size``) and
        ``"font_flags"`` (the span's ``flags``).
    """
    formatted_text = []
    # The context manager closes the document even if extraction raises;
    # the previous version never closed it and leaked the file handle.
    with fitz.open(file_path) as document:
        for page in document:
            for block in page.get_text("dict")["blocks"]:
                for line in block.get("lines", ()):
                    formatted_text.extend(
                        {
                            "text": span["text"],
                            "font_size": span["size"],
                            "font_flags": span["flags"],
                        }
                        for span in line["spans"]
                    )
    return formatted_text
def convert_pdf_to_docx(pdf_path, output_path):
    """Convert the PDF at *pdf_path* to a Word document at *output_path*.

    All pages are converted (``start=0, end=None``). A confirmation line is
    printed once the conversion has succeeded.

    Args:
        pdf_path: Value handed to pdf2docx's ``Converter``.
        output_path: Destination passed to ``Converter.convert``.
    """
    cv = Converter(pdf_path)
    try:
        cv.convert(output_path, start=0, end=None)
    finally:
        # Release the converter even when the conversion fails.
        cv.close()
    print(f"PDF converted to Word document and saved to {output_path}")
# PyMuPDF span ``flags`` bits (bit 0 / value 1 is superscript, not a style).
_SPAN_FLAG_ITALIC = 2
_SPAN_FLAG_BOLD = 16


def _pop_marked_segments(line, opener, closer):
    """Remove every ``opener...closer`` segment from *line*.

    Returns ``(segments, remainder)`` where *segments* are the enclosed
    texts in order of appearance and *remainder* is the line without them.
    An opener that has no closer after it is left in the remainder.
    """
    segments = []
    while True:
        start = line.find(opener)
        if start == -1:
            break
        end = line.find(closer, start + 1)
        if end == -1:
            # Unterminated marker: stop instead of slicing with -1, which
            # previously made the line grow and the loop spin forever.
            break
        segments.append(line[start + len(opener):end])
        line = line[:start] + line[end + len(closer):]
    return segments, line


def create_redlined_word_doc(formatted_text, changes, output_path):
    """Write a Word document with the original text followed by redlines.

    Layout of the saved document:
      1. One paragraph per entry of *formatted_text*, with the span's font
         size and bold/italic styling applied.
      2. For each non-blank line of the "Changed Sentences" section: one red
         strikethrough paragraph per ``[-...-]`` segment, one blue paragraph
         per ``[+...+]`` segment, then the remaining text (if any) as a
         plain paragraph.
      3. A page break, a "Detailed Explanation" heading and the explanation
         text.

    Args:
        formatted_text: Iterable of dicts with ``"text"``, ``"font_size"``
            and ``"font_flags"`` keys (PyMuPDF span flags).
        changes: Model response containing the sections
            ``1. **Changed Sentences**:`` and ``2. **Detailed Explanation**:``.
        output_path: Path the document is saved to.

    Raises:
        ValueError: If *changes* lacks the "Detailed Explanation" section
            marker. Raised before any document content is built.
    """
    # Validate the response first so a malformed one fails fast.
    sections = changes.split("2. **Detailed Explanation**:\n", 1)
    if len(sections) != 2:
        logger.error("Model response does not contain the expected sections.")
        raise ValueError("Invalid format: 'Changed Sentences' and 'Detailed Explanation' sections not found")
    changed_sentences = sections[0].replace("1. **Changed Sentences**:\n", "").strip()
    detailed_explanation = sections[1].strip()
    logger.info(detailed_explanation)
    logger.info(changed_sentences)

    doc = Document()

    # Original text, one paragraph per extracted span.
    for item in formatted_text:
        run = doc.add_paragraph().add_run(sanitize_text(item["text"]))
        run.font.size = Pt(item["font_size"])
        flags = item["font_flags"]
        # Bold is bit 4 (16) and italic is bit 1 (2); the earlier code
        # tested 2 for bold and 1 (superscript) for italic.
        if flags & _SPAN_FLAG_BOLD:
            run.bold = True
        if flags & _SPAN_FLAG_ITALIC:
            run.italic = True

    # Redlines: deletions are extracted before additions, so an addition
    # nested inside a deletion is dropped together with that deletion.
    for line in changed_sentences.split('\n'):
        if not line.strip():
            continue
        deletions, remainder = _pop_marked_segments(sanitize_text(line), '[-', '-]')
        additions, remainder = _pop_marked_segments(remainder, '[+', '+]')
        for deletion in deletions:
            run = doc.add_paragraph().add_run(deletion)
            run.font.color.rgb = RGBColor(255, 0, 0)
            run.font.strike = True
        for addition in additions:
            run = doc.add_paragraph().add_run(addition)
            run.font.color.rgb = RGBColor(0, 0, 255)
        if remainder.strip():
            doc.add_paragraph(remainder)

    doc.add_page_break()
    doc.add_heading('Detailed Explanation', level=1)
    doc.add_paragraph(detailed_explanation)
    doc.save(output_path)
    print(f"Redlined document saved to {output_path}")
def pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
    """Ask a chat model to redline a bank disclosure and save it as a .docx.

    The disclosure text is extracted from *file*, embedded in a prompt
    together with case-law context and the bank's policy answers, and sent
    to the selected model. The response is rendered by
    ``create_redlined_word_doc`` into ``output/Redlined_Reg_E_Notice.docx``.

    Args:
        file: The disclosure PDF. It is passed both to pdfminer's
            ``extract_text`` and to ``extract_text_with_formatting``.
            NOTE(review): presumably a filesystem path -- confirm with callers.
        model_name: Key into the module-level ``models`` registry.
        balance_type: Answer to "available balance or ledger balance?".
        apsn_transactions: Answer to "charge for APSN transactions?".
        max_fees_per_day: Answer to "overdraft fees per day?".
        min_overdrawn_fee: Answer to "minimum amount overdrawn to incur a fee?".
        min_transaction_overdraft: Answer to "minimum transaction amount to
            trigger an overdraft?".

    Returns:
        The path of the saved Word document.

    Raises:
        KeyError: If *model_name* is not a key of ``models``.
    """
    # Resolve the model first so an unknown name fails before any PDF work.
    model = models[model_name]
    disclosure_text = high_level.extract_text(file)
    # Two fixes relative to the earlier template:
    #  * it now contains the {disclosure} placeholder, so the extracted
    #    disclosure is actually sent to the model;
    #  * the requested output format matches what create_redlined_word_doc
    #    parses (section headings plus [-...-] / [+...+] markers).
    prompt_template = """
law context:
20-cv-12061 08-23-2021
Veronica Gardner, Plaintiff, v. Flagstar Bank, FSB, Defendant.
GERSHWIN A. DRAIN, UNITED STATES DISTRICT JUDGE
OPINION AND ORDER GRANTING IN PART AND DENYING IN PART DEFENDANT'S MOTION TO DISMISS [#18]
GERSHWIN A. DRAIN, UNITED STATES DISTRICT JUDGE
I. Introduction
On July 31, 2020, Plaintiff Veronica Gardner brought the instant action on behalf of herself and all others similarly situated against Defendant Flagstar Bank, FSB (“Flagstar” or “Bank”). ECF No. 1. Plaintiff filed her First Amended Complaint on October 6, 2020 and alleges that Defendant unlawfully assesses and collects overdraft fees on transactions, sometimes multiple times, in violation of the contract between the parties. Id. Plaintiff brings two state law claims for breach of contract and conversion. Id.
Presently before the Court is Defendant's Motion to Dismiss. ECF No. 18. This matter is fully briefed. ECF Nos. 20, 23. Plaintiff also filed two Notices of Supplemental Authority. ECF Nos. 26,
Tims v. LGE Cmty. Credit Union 935 F.3d 1228 (11th Cir. 2019)
have considered the matter. See Salls v. Dig. Fed. Credit Union , 349 F. Supp. 3d 81, 91 (D. Mass 2018) (collecting cases).
Tims's complaint challenged the substance of LGE's Opt-In Agreement. Because the safe harbor does not protect financial institutions from challenges to the substance of Opt-In Agreements, Tims's EFTA claim survives a motion to dismiss, and the district court erred in granting the motion.
IV. CONCLUSION
For the foregoing reasons, we reverse the district court's order granting LGE's motion to dismiss and remand for further proceedings consistent with this opinion.
REVERSED AND REMANDED.
13
bank disclosure:
{disclosure}
--------
above are several cases and a bank disclosure. Using the cases, please provide changes to the disclosure and keep as much formatting as possible and to ensure there are no legal contradictions between the content of the disclosure and the cases and please provide reasoning for each proposed change. Please also integrate the bank's policies into the disclosure. In the first sentence, please include a reference to the account agreement "for more information on overdrafts" and a placeholder for a URL. Available balance or ledger balance should replace money in the first sentence.
Here are the answers to the bank's policy questions:
Do you charge on available balance or ledger balance?: {balance_type}
Do you charge for APSN transactions?: {apsn_transactions}
How many overdraft fees per day can be charged?: {max_fees_per_day}
What is the minimum amount overdrawn to incur a fee?: {min_overdrawn_fee}
What is the minimum transaction amount to trigger an overdraft?: {min_transaction_overdraft}
Please output in exactly the following format, keeping the two section headings exactly as written:
1. **Changed Sentences**:
entire disclosure text updated with formatting retained, with every deletion marked as [-deleted text-] and every addition marked as [+added text+]
2. **Detailed Explanation**:
reasons for changes citing caselaw
"""
    prompt = prompt_template.format(
        disclosure=disclosure_text,
        balance_type=balance_type,
        apsn_transactions=apsn_transactions,
        max_fees_per_day=max_fees_per_day,
        min_overdrawn_fee=min_overdrawn_fee,
        min_transaction_overdraft=min_transaction_overdraft,
    )
    logger.debug("Formatted prompt: %s", prompt)

    chat_response = model.invoke(input=prompt)
    changes = chat_response.content
    logger.info("Model response:")
    logger.info(changes)

    output_directory = 'output'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_directory, exist_ok=True)
    output_path = os.path.join(output_directory, 'Redlined_Reg_E_Notice.docx')

    formatted_text = extract_text_with_formatting(file)
    create_redlined_word_doc(formatted_text, changes, output_path)
    return output_path