Spaces:

Hemasagar
/

Pdf-to-csv-audio-to-text

Sleeping

App Files Files Community

Hemasagar committed on May 30, 2024

Commit

127478b

verified ·

1 Parent(s): 2dc3e0e

Update utils.py

Browse files

Files changed (1) hide show

utils.py +3 -34

utils.py CHANGED Viewed

@@ -1,19 +1,11 @@
-#As the Langchain team has been working aggressively on improving the tool, we can see a lot of changes happening every week.
-#As a part of that, the import below has been deprecated:
-#from langchain.llms import OpenAI
 from langchain_openai import OpenAI
 from pypdf import PdfReader
-#from langchain.llms.openai import OpenAI
 import pandas as pd
 import re
-# import replicate
 from langchain.prompts import PromptTemplate
 from langchain_community.llms import CTransformers
 from ctransformers import AutoModelForCausalLM
 #Extract Information from PDF file
 def get_pdf_text(pdf_doc):
     text = ""
@@ -21,10 +13,6 @@ def get_pdf_text(pdf_doc):
     for page in pdf_reader.pages:
         text += page.extract_text()
     return text
-# filename = r"/Invoice_Extraction_Bot/Invoice/invoice_1001329.pdf"
-# raw_data=get_pdf_text(filename)
 #Function to extract data from text...
 def extracted_data(pages_data):
     template = """Please Extract all the following values : invoice no., Description, Quantity, date,
@@ -33,33 +21,14 @@ def extracted_data(pages_data):
         Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
         """
     prompt_template = PromptTemplate(input_variables=["pages"], template=template)
-    # llm = OpenAI(temperature=.7)
-    # full_response=llm(prompt_template.format(pages=pages_data))
-    #The below code will be used when we want to use LLAMA 2 model,  we will use Replicate for hosting our model....
-    # output = CTransformers(model=r"TheBloke/llama-2-7b-chat.ggmlv3.q8_0.bin",     #https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main
-    #                 model_type='llama',
-    #                     input={"prompt":prompt_template.format(pages=pages_data) ,
-    #                                 "temperature":0.1, "top_p":0.9, "max_length":512, "repetition_penalty":1})
     llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
     output_text=llm(prompt_template.format(pages=pages_data))
     full_response = ''
     for item in output_text:
         full_response += item
-        #print(full_response)
     return full_response
-#print(raw_data)
-# print("extracted raw  data")
-# llm_extracted_data=extracted_data(raw_data)
-        #print(llm_extracted_data)
 # Iterate over the PDF files that the user uploaded,
 # one by one
 def create_docs(user_pdf_list):
@@ -108,9 +77,9 @@ def create_docs(user_pdf_list):
             data_dict = {}
-        df=df._append([data_dict], ignore_index=True)
         print("********************DONE***************")
-        #df=df.append(save_to_dataframe(llm_extracted_data), ignore_index=True)
     df.head()
     return df

 from langchain_openai import OpenAI
 from pypdf import PdfReader
 import pandas as pd
 import re
 from langchain.prompts import PromptTemplate
 from langchain_community.llms import CTransformers
 from ctransformers import AutoModelForCausalLM
 #Extract Information from PDF file
 def get_pdf_text(pdf_doc):
     text = ""
     for page in pdf_reader.pages:
         text += page.extract_text()
     return text
 #Function to extract data from text...
 def extracted_data(pages_data):
     template = """Please Extract all the following values : invoice no., Description, Quantity, date,
         Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
         """
     prompt_template = PromptTemplate(input_variables=["pages"], template=template)
     llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
     output_text=llm(prompt_template.format(pages=pages_data))
     full_response = ''
     for item in output_text:
         full_response += item
     return full_response
 # Iterate over the PDF files that the user uploaded,
 # one by one
 def create_docs(user_pdf_list):
             data_dict = {}
+        df=df.append([data_dict], ignore_index=True)
         print("********************DONE***************")
+        # df=df.append(save_to_dataframe(llm_extracted_data), ignore_index=True)
     df.head()
     return df