Hemasagar committed (verified)
Commit 127478b · Parent(s): 2dc3e0e

Update utils.py

Files changed (1)
  1. utils.py +3 -34
utils.py CHANGED
@@ -1,19 +1,11 @@
-#As Langchain team has been working aggresively on improving the tool, we can see a lot of changes happening every weeek,
-#As a part of it, the below import has been depreciated
-#from langchain.llms import OpenAI
 from langchain_openai import OpenAI
-
 from pypdf import PdfReader
-#from langchain.llms.openai import OpenAI
 import pandas as pd
 import re
-# import replicate
 from langchain.prompts import PromptTemplate
 from langchain_community.llms import CTransformers
 from ctransformers import AutoModelForCausalLM
 
-
-
 #Extract Information from PDF file
 def get_pdf_text(pdf_doc):
     text = ""
@@ -21,10 +13,6 @@ def get_pdf_text(pdf_doc):
     for page in pdf_reader.pages:
         text += page.extract_text()
     return text
-
-# filename = r"/Invoice_Extraction_Bot/Invoice/invoice_1001329.pdf"
-
-# raw_data=get_pdf_text(filename)
 #Function to extract data from text...
 def extracted_data(pages_data):
     template = """Please Extract all the following values : invoice no., Description, Quantity, date,
@@ -33,33 +21,14 @@ def extracted_data(pages_data):
     Expected output: remove any dollar symbols {{'Invoice no.': '1001329','Description': 'Office Chair','Quantity': '2','Date': '5/4/2023','Unit price': '1100.00$','Amount': '2200.00$','Total': '2200.00$','Email': '[email protected]','Phone number': '9999999999','Address': 'Mumbai, India'}}
     """
     prompt_template = PromptTemplate(input_variables=["pages"], template=template)
-
-    # llm = OpenAI(temperature=.7)
-    # full_response=llm(prompt_template.format(pages=pages_data))
-
-
-    #The below code will be used when we want to use LLAMA 2 model, we will use Replicate for hosting our model....
-
-    # output = CTransformers(model=r"TheBloke/llama-2-7b-chat.ggmlv3.q8_0.bin", #https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGML/tree/main
-    #                     model_type='llama',
-    #                     input={"prompt":prompt_template.format(pages=pages_data) ,
-    #                             "temperature":0.1, "top_p":0.9, "max_length":512, "repetition_penalty":1})
     llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7B-Chat-GGML", model_file="llama-2-7b-chat.ggmlv3.q8_0.bin")
     output_text=llm(prompt_template.format(pages=pages_data))
 
     full_response = ''
     for item in output_text:
         full_response += item
-
-
-    #print(full_response)
     return full_response
-
-#print(raw_data)
-# print("extracted raw data")
-# llm_extracted_data=extracted_data(raw_data)
-#print(llm_extracted_data)
-
+
 # iterate over files in
 # that user uploaded PDF files, one by one
 def create_docs(user_pdf_list):
@@ -108,9 +77,9 @@ def create_docs(user_pdf_list):
         data_dict = {}
 
 
-        df=df._append([data_dict], ignore_index=True)
+        df=df.append([data_dict], ignore_index=True)
         print("********************DONE***************")
-        #df=df.append(save_to_dataframe(llm_extracted_data), ignore_index=True)
+        # df=df.append(save_to_dataframe(llm_extracted_data), ignore_index=True)
 
     df.head()
     return df
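
Side note on the final hunk: pandas deprecated DataFrame.append in 1.4 and removed it in 2.0, so the committed df.append(...) line only runs on older pandas releases, while the previous df._append(...) relied on a private method. Below is a minimal, hypothetical sketch of a version-independent alternative using pd.concat; the append_row helper and the sample keys are illustrative and not part of the commit.

# Hypothetical sketch, not part of the commit: accumulate rows without
# DataFrame.append, which was removed in pandas 2.0; pd.concat is the
# supported replacement.
import pandas as pd

def append_row(df: pd.DataFrame, data_dict: dict) -> pd.DataFrame:
    # Wrap the single record in a one-row DataFrame and concatenate it.
    return pd.concat([df, pd.DataFrame([data_dict])], ignore_index=True)

df = pd.DataFrame()
df = append_row(df, {"Invoice no.": "1001329", "Description": "Office Chair", "Total": "2200.00"})
print(df)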
 
 
 
 
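
For context, the driver code removed by this commit (the commented-out filename, raw_data, and llm_extracted_data lines) showed how the two helpers are meant to be chained. A hedged usage sketch follows, assuming utils.py is importable and a local invoice PDF exists; the path below is illustrative.

# Hypothetical usage sketch; the PDF path is an example, not a file shipped with the repo.
from utils import get_pdf_text, extracted_data

filename = "Invoice/invoice_1001329.pdf"        # illustrative local path
raw_data = get_pdf_text(filename)               # concatenate the text of every page
llm_extracted_data = extracted_data(raw_data)   # prompt the local Llama-2 GGML model
print(llm_extracted_data)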