Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -18,7 +18,8 @@ import re
|
|
18 |
from PyPDF2 import PdfReader
|
19 |
|
20 |
#tempat vectordb
|
21 |
-
|
|
|
22 |
|
23 |
#embeddings
|
24 |
embeddings = OpenAIEmbeddings()
|
@@ -88,7 +89,7 @@ def get_text_chunks(text):
|
|
88 |
def get_vectorstore(text_chunks):
|
89 |
# embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
|
90 |
# vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
|
91 |
-
vectorstore = Chroma(persist_directory=
|
92 |
return vectorstore
|
93 |
|
94 |
|
@@ -181,25 +182,25 @@ def main():
|
|
181 |
if st.button("Re-Processing New Data"):
|
182 |
with st.spinner("Processing..."):
|
183 |
# BERITA
|
184 |
-
#
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
#
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
#
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
|
204 |
# directory ke pdf regulasi
|
205 |
folder_path = 'pdf/'
|
@@ -235,7 +236,6 @@ def main():
|
|
235 |
print("splitting final text berhasil")
|
236 |
|
237 |
#save dengan chroma
|
238 |
-
dirsave = "cumandoc"
|
239 |
vectorstore = Chroma.from_texts(texts,
|
240 |
embeddings,
|
241 |
persist_directory=dirsave)
|
|
|
18 |
from PyPDF2 import PdfReader
|
19 |
|
20 |
#tempat vectordb
|
21 |
+
dirload = '24feb24-openaiv2'
|
22 |
+
dirsave = "terbaru"
|
23 |
|
24 |
#embeddings
|
25 |
embeddings = OpenAIEmbeddings()
|
|
|
89 |
def get_vectorstore(text_chunks):
    """Return a Chroma vector store bound to the `dirload` persist directory.

    `text_chunks` is accepted for interface compatibility but is not read
    here: this function embeds nothing itself. It constructs the store with
    `persist_directory=dirload` and `embedding_function=embeddings`, both of
    which are names defined outside this function.
    """
    # A previous variant (left disabled in the original) built a FAISS index
    # from `text_chunks` with HuggingFaceInstructEmbeddings("hkunlp/instructor-xl").
    return Chroma(persist_directory=dirload, embedding_function=embeddings)
|
94 |
|
95 |
|
|
|
182 |
if st.button("Re-Processing New Data"):
|
183 |
with st.spinner("Processing..."):
|
184 |
# BERITA
|
185 |
+
# Find the CSV files in the "berita" directory (the first match is used)
|
186 |
+
sumber = glob.glob("berita/*.csv")
|
187 |
+
df = pd.read_csv(sumber[0])
|
188 |
+
banyakBerita = len(df)
|
189 |
+
print("sumber berita ditemukan")
|
190 |
+
|
191 |
+
#update banyak berita txt
|
192 |
+
with open("banyakBerita.txt", "w") as file:
|
193 |
+
file.write(str(banyakBerita))
|
194 |
+
print("update file text berita berhasil")
|
195 |
+
|
196 |
+
#combining and converting
|
197 |
+
df["combined"] = ""
|
198 |
+
for row in range(len(df)):
|
199 |
+
kombinasi = "berita ke-" + str(row+1) + " \n " + "judul: " + str(df['title'].loc[row]) + " \n " + "link: "+ str(df['url'].loc[row]) + " \n " + "tanggal rilis: " + str(df['datetime'].loc[row]) + " \n " + "penulis: " + str(df['author'].loc[row]) + " \n " + "isi berita: " + str(df['text'].loc[row]) + " \n " + "sumber: " + str(df['source'].loc[row]) + " \n "
|
200 |
+
df['combined'].loc[row] = kombinasi
|
201 |
+
listberita = df["combined"].tolist()
|
202 |
+
textberita = " ".join(listberita)
|
203 |
+
print("combining and converting berhasil")
|
204 |
|
205 |
# directory ke pdf regulasi
|
206 |
folder_path = 'pdf/'
|
|
|
236 |
print("splitting final text berhasil")
|
237 |
|
238 |
#save dengan chroma
|
|
|
239 |
vectorstore = Chroma.from_texts(texts,
|
240 |
embeddings,
|
241 |
persist_directory=dirsave)
|