deekshith-rj committed
Commit
85eaaaa
1 Parent(s): 483e2cf

PoC first release: no database update procedures are included yet, just the app (plus its direct dependencies), which uses the already generated databases, db_faiss and database.db.

.env ADDED
@@ -0,0 +1,8 @@
+ API_KEY=AIzaSyBA0cSPTDRsuan7M_rMiX0SqvAt-a35PJk
+ SECRET_KEY=DASNUEREHFDSFSDFDSE
+ ENVIRONMENT=DEVELOPMENT
+ GOOGLE_APPLICATION_CREDENTIALS=fact-check-ifcn-65173e5552e8.json
+ MODEL_PATH=models/ggml-model-q5_k_m.bin
+ CHROMA_DB_PATH=db_chroma
+ FAISS_DB_PATH=db_faiss
+ DB_PATH=database.db
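Note: these values are read at import time with python-dotenv, mirroring what src/utils.py (below) does; a minimal sketch of the pattern:

import os
from dotenv import load_dotenv

load_dotenv()                               # loads the key=value pairs from .env into the environment
MODEL_PATH = os.getenv('MODEL_PATH')        # models/ggml-model-q5_k_m.bin
FAISS_DB_PATH = os.getenv('FAISS_DB_PATH')  # db_faiss
DB_PATH = os.getenv('DB_PATH')              # database.db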
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ database.db filter=lfs diff=lfs merge=lfs -text
+ db_faiss/index.faiss filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,13 @@
+ venv/*
+ .vscode/*
+ .idea/*
+
+ *.pyc
+
+ .env
+
+ #*.db
+ db_chroma
+ #db_faiss
+
+ #models/*
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: purple
  colorTo: purple
  sdk: gradio
  sdk_version: 4.13.0
- app_file: app.py
+ app_file: app_gradio.py
  pinned: false
  ---

app_gradio.py ADDED
@@ -0,0 +1,47 @@
+ import gradio as gr
+ import pandas as pd
+ from src.utils import get_rag_chain
+
+
+ rag = get_rag_chain()
+
+
+ # Process the RAG results into a text answer and an HTML table of source documents
+ def query_fc(query):
+     # query = "Is Africa the youngest continent in the world?"
+     result = rag.invoke(query)
+     docs = [doc.metadata for doc in result['source_documents']]
+     df = pd.DataFrame(docs)
+
+     df.url = df.apply(lambda x: "<a href='{}'>{}</a>".format(x.url, x.title),
+                       axis=1)
+     df['publisher'] = df.apply(lambda x: "<a href='https://{}'>{}</a>".
+                                format(x.publisher_site, x.publisher_name), axis=1)
+     df.drop(columns=['language_code', 'title', 'claim_date', 'review_date',
+                      'publisher_site', 'publisher_name'], inplace=True)
+     df.rename(columns={'url': 'FC article', 'claim': 'Claim', 'publisher': 'FC Publisher',
+                        'claimant': 'Claimant', 'textual_rating': 'FC Rating'},
+               inplace=True)
+
+     # Reorder the columns in the DataFrame
+     column_order = ['Claim', 'FC Rating', 'FC article', 'FC Publisher', 'Claimant']
+     df = df.reindex(columns=column_order)
+
+     return (result['result'],
+             "<div style='max-width:100%; max-height:360px; overflow:auto'>"
+             + df.to_html(index=False, escape=False) + "</div>")
+
+
+ app = gr.Interface(
+     fn=query_fc,
+     inputs=gr.Textbox(placeholder="Enter your query here...", label='Query'),
+     outputs=[
+         gr.Textbox(label="Fact-check"),
+         gr.HTML(label="Source Documents")],  # FIXME: the label is not showing
+     examples=[
+         ["Is Joe Biden offering motel stays to undocumented immigrants?"],
+         ["Did Justin Trudeau sit in protest in support of the protesting Indian farmers?"],
+     ])
+
+ if __name__ == "__main__":
+     app.launch()
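For a quick check without the Gradio UI, the handler can be called directly; a sketch, assuming the FAISS index and the GGML model are present at the paths set in .env (importing app_gradio builds the RAG chain, so this takes a while):

from app_gradio import query_fc

answer, sources_html = query_fc("Is Africa the youngest continent in the world?")
print(answer)        # the generated fact-check summary
print(sources_html)  # scrollable HTML table of the matching fact-check articles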
database.db ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f31d15b7f83ee13d07b73b7a59d4bf59067866fb78e3796a4003e77504e4aa3f
+ size 33193984
db_faiss/index.faiss ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:36983aba7c7a06f16346ca98eb8ef12a0cbc78a327a46e0b6bb67dc784b0e505
+ size 253243437
db_faiss/index.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11eaa06cd125eb24568010ae15ee400195cf9cc33f71363f9d268cedb9f923d7
+ size 56264524
models/ggml-model-q5_k_m.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf24ef596be9bc2a13f9edbd3c0ce3e8fe2d9a1a01329a49b42babe26b963d9a
+ size 4783156800
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ pandas
+ gradio
+ langchain
+ python-dotenv
+ sentence-transformers
+ llama-cpp-python
+ faiss-cpu
src/utils.py ADDED
@@ -0,0 +1,292 @@
+ # import random
+ # import sqlite3
+ # import time
+
+ # from googleapiclient.discovery import build
+ # from google.oauth2 import service_account
+ # from googleapiclient.errors import HttpError
+ # import pandas as pd
+ # import requests
+ # from bs4 import BeautifulSoup
+ # import pickle
+ # import tldextract
+
+ import os
+ from dotenv import load_dotenv
+
+ # from langchain.schema import Document
+ # from langchain.vectorstores.utils import DistanceStrategy
+ # from torch import cuda, bfloat16
+ # import torch
+ # import transformers
+ # from transformers import AutoTokenizer
+ # from langchain.document_loaders import TextLoader
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.llms import LlamaCpp
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain.chains import RetrievalQA  # RetrievalQAWithSourcesChain
+
+ # from config import IFCN_LIST_URL
+
+ IFCN_FILENAME = os.path.join(os.path.dirname(os.path.dirname(__file__)),
+                              'ifcn_df.csv')
+
+ load_dotenv()
+ DB_PATH = os.getenv('DB_PATH')
+ FAISS_DB_PATH = os.getenv('FAISS_DB_PATH')
+ MODEL_PATH = os.getenv('MODEL_PATH')
+
+
+ # def get_claims(claims_serv, query_str, lang_code):
+ #     """Queries the Google Fact Check API using the search string and returns the results
+
+ #     Args:
+ #         claims_serv (build().claims() object): build() creates a service object \
+ #             for the factchecktools API; claims() creates a 'claims' object which \
+ #             can be used to query with the search string
+ #         query_str (str): the query string
+ #         lang_code (str): BCP-47 language code, used to restrict search results by language
+
+ #     Returns:
+ #         list: the list of all search results returned by the API
+ #     """
+ #     claims = []
+ #     req = claims_serv.search(query=query_str, languageCode=lang_code)
+ #     try:
+ #         res = req.execute()
+ #         claims = res['claims']  # FIXME: is returning KeyError, perhaps when Google API is unresponsive
+ #     except HttpError as e:
+ #         print('Error response status code : {0}, reason : {1}'.format(e.status_code, e.error_details))
+
+ #     # Aggregate all the results pages into one object
+ #     while 'nextPageToken' in res.keys():
+ #         req = claims_serv.search_next(req, res)
+ #         res = req.execute()
+ #         claims.extend(res['claims'])
+
+ #     # TODO: Also return any basic useful metrics based on the results
+
+ #     return claims
+
+
+ # def reformat_claims(claims):
+ #     """Reformats the list of nested claims / search results into a DataFrame
+
+ #     Args:
+ #         claims (list): list of nested claims / search results
+
+ #     Returns:
+ #         pd.DataFrame: DataFrame containing search results, one per each row
+ #     """
+ #     # Format the results object into a format that is convenient to use
+ #     df = pd.DataFrame(claims)
+ #     df = df.explode('claimReview').reset_index(drop=True)
+ #     claim_review_df = pd.json_normalize(df['claimReview'])
+ #     return pd.concat([df.drop('claimReview', axis=1), claim_review_df], axis=1)
+
+
+ # def certify_claims(claims_df):
+ #     """Certifies all the search results from the API against a list of verified IFCN signatories
+
+ #     Args:
+ #         claims_df (pd.DataFrame): DataFrame object containing all search results from the API
+
+ #     Returns:
+ #         pd.DataFrame: claims dataframe filtered to include only IFCN-certified claims
+ #     """
+ #     ifcn_to_use = get_ifcn_to_use()
+ #     claims_df['ifcn_check'] = claims_df['publisher.site'].apply(remove_subdomain).isin(ifcn_to_use)
+ #     return claims_df[claims_df['ifcn_check']].drop('ifcn_check', axis=1)
+
+
+ # def get_ifcn_data():
+ #     """Standalone function to update the IFCN signatories CSV file that is stored locally"""
+ #     r = requests.get(IFCN_LIST_URL)
+ #     soup = BeautifulSoup(r.content, 'html.parser')
+ #     cats_list = soup.find_all('div', class_='row mb-5')
+
+ #     active = cats_list[0].find_all('div', class_='media')
+ #     active = extract_ifcn_df(active, 'active')
+
+ #     under_review = cats_list[1].find_all('div', class_='media')
+ #     under_review = extract_ifcn_df(under_review, 'under_review')
+
+ #     expired = cats_list[2].find_all('div', class_='media')
+ #     expired = extract_ifcn_df(expired, 'expired')
+
+ #     ifcn_df = pd.concat([active, under_review, expired], axis=0, ignore_index=True)
+ #     ifcn_df['country'] = ifcn_df['country'].str.strip('from ')
+ #     ifcn_df['verified_date'] = ifcn_df['verified_date'].str.strip('Verified on ')
+
+ #     ifcn_df.to_csv(IFCN_FILENAME, index=False)
+
+
+ # def extract_ifcn_df(ifcn_list, status):
+ #     """Returns useful info from a list of IFCN signatories
+
+ #     Args:
+ #         ifcn_list (list): list of IFCN signatories
+ #         status (str): status code to be used for all signatories in this list
+
+ #     Returns:
+ #         pd.DataFrame: a dataframe of IFCN signatories' data
+ #     """
+ #     ifcn_data = [{
+ #         'url': x.a['href'],
+ #         'name': x.h5.text,
+ #         'country': x.h6.text,
+ #         'verified_date': x.find_all('span', class_='small')[1].text,
+ #         'ifcn_profile_url':
+ #             x.find('a', class_='btn btn-sm btn-outline btn-link mb-0')['href'],
+ #         'status': status
+ #     } for x in ifcn_list]
+ #     return pd.DataFrame(ifcn_data)
+
+
+ # def remove_subdomain(url):
+ #     """Removes the subdomain from a URL hostname - useful when comparing two URLs
+
+ #     Args:
+ #         url (str): URL hostname
+
+ #     Returns:
+ #         str: URL with subdomain removed
+ #     """
+ #     extract = tldextract.extract(url)
+ #     return extract.domain + '.' + extract.suffix
+
+
+ # def get_ifcn_to_use():
+ #     """Returns the IFCN data for non-expired signatories
+
+ #     Returns:
+ #         pd.Series: URLs of non-expired IFCN signatories
+ #     """
+ #     ifcn_df = pd.read_csv(IFCN_FILENAME)
+ #     ifcn_url = ifcn_df.loc[ifcn_df.status.isin(['active', 'under_review']), 'url']
+ #     return [remove_subdomain(x) for x in ifcn_url]
+
+
+ # def get_gapi_service():
+ #     """Returns a Google Fact-Check API-specific service object used to query the API
+
+ #     Returns:
+ #         googleapiclient.discovery.Resource: API-specific service object
+ #     """
+ #     load_dotenv()
+ #     environment = os.getenv('ENVIRONMENT')
+ #     if environment == 'DEVELOPMENT':
+ #         api_key = os.getenv('API_KEY')
+ #         service = build('factchecktools', 'v1alpha1', developerKey=api_key)
+ #     elif environment == 'PRODUCTION':
+ #         google_application_credentials = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
+ #         # FIXME: The below credentials not working, the HTTP request throws HTTPError 400
+ #         # credentials = service_account.Credentials.from_service_account_file(
+ #         #     GOOGLE_APPLICATION_CREDENTIALS)
+ #         credentials = service_account.Credentials.from_service_account_file(
+ #             google_application_credentials,
+ #             scopes=['https://www.googleapis.com/auth/userinfo.email',
+ #                     'https://www.googleapis.com/auth/cloud-platform'])
+ #         service = build('factchecktools', 'v1alpha1', credentials=credentials)
+ #     return service
+
+
+ # # USED IN update_database.py ----
+ # def get_claims_by_site(claims_serv, publisher_site, lang_code):
+ #     # TODO: Any HTTP or other errors in this function need to be handled better
+ #     req = claims_serv.search(reviewPublisherSiteFilter=publisher_site,
+ #                              languageCode=lang_code)
+ #     while True:
+ #         try:
+ #             res = req.execute()
+ #             break
+ #         except HttpError as e:
+ #             print('Error response status code : {0}, reason : {1}'.
+ #                   format(e.status_code, e.error_details))
+ #             time.sleep(random.randint(50, 60))
+ #     if 'claims' in res:
+ #         claims = res['claims']  # FIXME: is returning KeyError when Google API is unresponsive?
+ #         print('first 10')
+ #         req_prev, req = req, None
+ #         res_prev, res = res, None
+ #     else:
+ #         print('No data')
+ #         return []
+
+ #     # Aggregate all the results pages into one object
+ #     while 'nextPageToken' in res_prev.keys():
+ #         req = claims_serv.search_next(req_prev, res_prev)
+ #         try:
+ #             res = req.execute()
+ #             claims.extend(res['claims'])
+ #             req_prev, req = req, None
+ #             res_prev, res = res, None
+ #             print('another 10')
+ #         except HttpError as e:
+ #             print('Error in while loop : {0}, \
+ #                   reason : {1}'.format(e.status_code, e.error_details))
+ #             time.sleep(random.randint(50, 60))
+
+ #     return claims
+
+
+ # def rename_claim_attrs(df):
+ #     return df.rename(
+ #         columns={'claimDate': 'claim_date',
+ #                  'reviewDate': 'review_date',
+ #                  'textualRating': 'textual_rating',
+ #                  'languageCode': 'language_code',
+ #                  'publisher.name': 'publisher_name',
+ #                  'publisher.site': 'publisher_site'}
+ #     )
+
+
+ # def clean_claims(df):
+ #     pass
+
+
+ # def write_claims_to_db(df):
+ #     with sqlite3.connect(DB_PATH) as db_con:
+ #         df.to_sql('claims', db_con, if_exists='append', index=False)
+ #     # FIXME: The id variable is not getting auto-incremented
+
+
+ # def generate_and_store_embeddings(df, embed_model, overwrite):
+ #     # TODO: Combine "text" & "textual_rating" to generate useful statements
+ #     df['fact_check'] = 'The fact-check result for the claim "' + df['text'] \
+ #         + '" is "' + df['textual_rating'] + '"'
+ #     # TODO: Are ids required?
+
+ #     df.rename(columns={'text': 'claim'}, inplace=True)
+ #     docs = \
+ #         [Document(page_content=row['fact_check'],
+ #                   metadata=row.drop('fact_check').to_dict())
+ #          for idx, row in df.iterrows()]
+
+ #     if overwrite == True:
+ #         db = FAISS.from_documents(docs, embed_model, distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)
+ #         # FIXME: MAX_INNER_PRODUCT is not being used currently, only EUCLIDEAN_DISTANCE
+ #         db.save_local(FAISS_DB_PATH)
+ #     elif overwrite == False:
+ #         db = FAISS.load_local(FAISS_DB_PATH, embed_model, distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)
+ #         db.add_documents(docs)
+ #         db.save_local(FAISS_DB_PATH)
+
+
+ def get_rag_chain():
+     model_name = "sentence-transformers/all-mpnet-base-v2"
+     model_kwargs = {"device": "cpu"}
+     embed_model = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
+     llm = LlamaCpp(model_path=MODEL_PATH)
+
+     db_vector = FAISS.load_local(FAISS_DB_PATH, embed_model)
+     retriever = db_vector.as_retriever()
+
+     return RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=retriever,
+         return_source_documents=True,
+         verbose=True
+     )
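get_rag_chain() is the only active entry point of this module in this release; app_gradio.py consumes it roughly like this (sketch):

from src.utils import get_rag_chain

rag = get_rag_chain()   # loads the embedding model, the LlamaCpp model and the local FAISS index
result = rag.invoke("Is Africa the youngest continent in the world?")
print(result['result'])                     # generated answer
for doc in result['source_documents']:      # retrieved fact-check claims
    print(doc.metadata['url'], doc.metadata['textual_rating'])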