Jayavathsan committed on
Commit
24d7c6d
·
1 Parent(s): 66a45e7

Create pages/admin_utils.py

Browse files
Files changed (1) hide show
  1. pages/admin_utils.py +77 -0
pages/admin_utils.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.embeddings import OpenAIEmbeddings
4
+ from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
5
+ from langchain.llms import OpenAI
6
+ import pinecone
7
+ from langchain.vectorstores import Pinecone
8
+ import pandas as pd
9
+ from sklearn.model_selection import train_test_split
10
+
11
+
12
+
13
+
14
+ #**********Functions to help you load documents to PINECONE***********
15
+
16
+ #Read PDF data
17
def read_pdf_data(pdf_file):
    """Return the text of every page in a PDF as one string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts as its source
            (presumably a path or an open binary stream -- confirm
            against the pypdf version in use).

    Returns:
        The text of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    reader = PdfReader(pdf_file)
    # Join once instead of growing a string with ``+=`` per page.
    # ``or ""`` is defensive: a page that yields nothing contributes an
    # empty string rather than breaking the concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
23
+
24
+ #Split data into chunks
25
def split_data(text, chunk_size=1000, chunk_overlap=50):
    """Split raw text into document chunks.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000, the
            value that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 50, the previously hard-coded value.

    Returns:
        The result of ``create_documents`` applied to the split text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(text)
    # NOTE(review): ``create_documents`` appears to run the splitter on
    # its inputs again; ``create_documents([text])`` may be equivalent.
    # Kept as two steps to preserve the existing output -- confirm.
    return text_splitter.create_documents(chunks)
30
+
31
+ #Create embeddings instance
32
def create_embeddings_load_data(model_name="all-MiniLM-L6-v2"):
    """Create the embeddings object used when loading documents.

    Args:
        model_name: Name passed to ``SentenceTransformerEmbeddings``.
            Defaults to ``"all-MiniLM-L6-v2"``, the value that used to
            be hard-coded here.

    Returns:
        A ``SentenceTransformerEmbeddings`` instance for ``model_name``.
    """
    # NOTE(review): a different model presumably changes the embedding
    # dimension, which would need to match the target index -- confirm
    # before overriding the default.
    return SentenceTransformerEmbeddings(model_name=model_name)
36
+
37
+ #Function to push data to Pinecone
38
def push_to_pinecone(pinecone_apikey, pinecone_environment, pinecone_index_name, embeddings, docs):
    """Initialise the Pinecone client and load documents into an index.

    Args:
        pinecone_apikey: API key passed to ``pinecone.init``.
        pinecone_environment: Environment name passed to ``pinecone.init``.
        pinecone_index_name: Name of the index the documents go to.
        embeddings: Embeddings object passed to ``Pinecone.from_documents``.
        docs: Documents passed to ``Pinecone.from_documents``.

    Returns:
        Whatever ``Pinecone.from_documents`` returns for the given index.
    """
    # The client must be initialised before the vector store is built.
    pinecone.init(api_key=pinecone_apikey, environment=pinecone_environment)
    return Pinecone.from_documents(docs, embeddings, index_name=pinecone_index_name)
48
+
49
+ #*********Functions for dealing with Model related tasks...************
50
+
51
+ #Read dataset for model creation
52
def read_data(data):
    """Load a comma-separated, header-less dataset into a DataFrame.

    Args:
        data: Source handed straight to ``pandas.read_csv``.

    Returns:
        A DataFrame whose columns are numbered from 0, because the
        input is read with ``header=None``.
    """
    return pd.read_csv(data, delimiter=",", header=None)
55
+
56
+ #Create embeddings instance
57
def get_embeddings(model_name="all-MiniLM-L6-v2"):
    """Create the embeddings object used for the model-related tasks.

    Args:
        model_name: Name passed to ``SentenceTransformerEmbeddings``.
            Defaults to ``"all-MiniLM-L6-v2"``, the value that used to
            be hard-coded here.

    Returns:
        A ``SentenceTransformerEmbeddings`` instance for ``model_name``.
    """
    return SentenceTransformerEmbeddings(model_name=model_name)
60
+
61
+ #Generating embeddings for our input dataset
62
def create_embeddings(df, embeddings):
    """Embed column 0 of ``df`` and store the result in column 2.

    Args:
        df: DataFrame whose column ``0`` holds the values to embed.
        embeddings: Object exposing ``embed_query(value)``.

    Returns:
        The same ``df`` object, modified in place: column ``2`` holds
        the ``embed_query`` result for each row's column ``0`` value.
    """
    # The method is passed directly; no wrapper function is needed.
    embedded = df[0].apply(embeddings.embed_query)
    df[2] = embedded
    return df
65
+
66
+ #Splitting the data into train & test
67
def split_train_test__data(df_sample):
    """Split the dataset into training and testing parts.

    Column ``2`` of ``df_sample`` is used as the inputs and column ``1``
    as the labels. The split holds out 25% for testing with a fixed
    ``random_state`` of 0, so repeated calls on the same data give the
    same partition.

    Args:
        df_sample: DataFrame providing columns ``1`` and ``2``.

    Returns:
        ``(sentences_train, sentences_test, labels_train, labels_test)``.
    """
    inputs = list(df_sample[2])
    labels = list(df_sample[1])
    parts = train_test_split(inputs, labels, test_size=0.25, random_state=0)
    sentences_train, sentences_test, labels_train, labels_test = parts
    # Existing diagnostic output: size of the training split.
    print(len(sentences_train))
    return sentences_train, sentences_test, labels_train, labels_test
73
+
74
+ #Get the accuracy score on test data
75
def get_score(svm_classifier, sentences_test, labels_test):
    """Return the classifier's score on the held-out test data.

    Args:
        svm_classifier: Object exposing ``score(inputs, labels)``.
        sentences_test: Test inputs forwarded to ``score``.
        labels_test: Expected labels forwarded to ``score``.

    Returns:
        The value returned by ``svm_classifier.score``.
    """
    return svm_classifier.score(sentences_test, labels_test)