Spaces:
Runtime error
Runtime error
Add AWS files
Browse files- main.py +57 -2
- requirements.txt +2 -1
main.py
CHANGED
@@ -7,6 +7,8 @@ import json
|
|
7 |
import faiss
|
8 |
import nest_asyncio
|
9 |
import sys
|
|
|
|
|
10 |
from pathlib import Path
|
11 |
from bs4 import BeautifulSoup
|
12 |
from typing import Union, List
|
@@ -47,7 +49,55 @@ Settings.context_window = 20000
|
|
47 |
Settings.chunk_size = 2048
|
48 |
Settings.similarity_top_k = 20
|
49 |
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
52 |
# Apply nest_asyncio to handle nested async calls
|
53 |
nest_asyncio.apply()
|
@@ -135,7 +185,8 @@ def get_links_html_lp(lp_ids):
|
|
135 |
|
136 |
def initialize_components():
|
137 |
try:
|
138 |
-
persist_path = Path(PERSIST_DIR)
|
|
|
139 |
|
140 |
if not persist_path.exists():
|
141 |
raise FileNotFoundError(f"Directory not found: {persist_path}")
|
@@ -149,9 +200,13 @@ def initialize_components():
|
|
149 |
|
150 |
global retriever_bm25
|
151 |
|
|
|
152 |
docstore = SimpleDocumentStore.from_persist_path(str(persist_path / "docstore_es_filter.json"))
|
153 |
bm25_retriever = BM25Retriever.from_persist_dir(str(persist_path / "bm25_retriever_es"))
|
154 |
|
|
|
|
|
|
|
155 |
retriever_bm25 = QueryFusionRetriever(
|
156 |
[
|
157 |
bm25_retriever,
|
|
|
7 |
import faiss
|
8 |
import nest_asyncio
|
9 |
import sys
|
10 |
+
import boto3
|
11 |
+
|
12 |
from pathlib import Path
|
13 |
from bs4 import BeautifulSoup
|
14 |
from typing import Union, List
|
|
|
49 |
Settings.chunk_size = 2048
|
50 |
Settings.similarity_top_k = 20
|
51 |
|
52 |
+
import boto3
|
53 |
+
import os
|
54 |
+
from pathlib import Path
|
55 |
+
|
56 |
+
# S3 parameters
|
57 |
+
BUCKET_NAME = "legal-position"
|
58 |
+
PREFIX_RETRIEVER = "Save_Index/bm25_retriever_es/"
|
59 |
+
FILE_DOCSTORE = "Save_Index/docstore_es_filter.json"
|
60 |
+
LOCAL_DIR = Path("local_data") # Локальна директорія для збереження даних
|
61 |
+
|
62 |
+
# Initialize the S3 client
|
63 |
+
s3_client = boto3.client(
|
64 |
+
"s3",
|
65 |
+
aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
|
66 |
+
aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
|
67 |
+
region_name="eu-north-1"
|
68 |
+
)
|
69 |
+
|
70 |
+
# Create the local directory if it does not exist
|
71 |
+
LOCAL_DIR.mkdir(parents=True, exist_ok=True)
|
72 |
+
|
73 |
+
# Function for downloading a single file from S3
|
74 |
+
def download_s3_file(bucket_name, s3_key, local_path):
    """Download a single S3 object to a local file.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        s3_key: Key of the object inside the bucket.
        local_path: Destination path (``str`` or ``Path``) on the local disk.
    """
    # The S3 download call does not create missing directories, so make sure
    # the destination folder exists (same approach as download_s3_folder).
    Path(local_path).parent.mkdir(parents=True, exist_ok=True)
    s3_client.download_file(bucket_name, s3_key, str(local_path))
    print(f"Завантажено: {s3_key} -> {local_path}")
|
77 |
+
|
78 |
+
# Function for downloading an S3 folder into a local directory
|
79 |
+
def download_s3_folder(bucket_name, prefix, local_dir):
    """Download every object stored under an S3 prefix into a local directory.

    The key layout below ``prefix`` is reproduced below ``local_dir``.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        prefix: Key prefix ("folder") whose objects are downloaded.
        local_dir: Local directory (``str`` or ``Path``) that receives the files.
    """
    target_root = Path(local_dir)
    # A single list_objects_v2 call returns at most 1000 keys; the paginator
    # follows continuation tokens so larger folders are downloaded completely.
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        # Pages without any match carry no 'Contents' entry.
        for obj in page.get("Contents", []):
            s3_key = obj["Key"]
            # "Folder" placeholder keys end with '/' and would resolve to a
            # directory path locally, so there is nothing to download for them.
            if s3_key.endswith("/"):
                continue
            # The local path mirrors the key relative to the prefix.
            local_file_path = target_root / Path(s3_key).relative_to(prefix)
            # Create sub-directories when needed.
            local_file_path.parent.mkdir(parents=True, exist_ok=True)
            # Download the file.
            s3_client.download_file(bucket_name, s3_key, str(local_file_path))
            print(f"Завантажено: {s3_key} -> {local_file_path}")
|
90 |
+
|
91 |
+
# Download the `docstore_es_filter.json` file
|
92 |
+
docstore_local_path = LOCAL_DIR / "docstore_es_filter.json"
|
93 |
+
download_s3_file(BUCKET_NAME, FILE_DOCSTORE, docstore_local_path)
|
94 |
+
|
95 |
+
# Download the `bm25_retriever_es` folder
|
96 |
+
bm25_retriever_local_dir = LOCAL_DIR / "bm25_retriever_es"
|
97 |
+
download_s3_folder(BUCKET_NAME, PREFIX_RETRIEVER, bm25_retriever_local_dir)
|
98 |
+
|
99 |
+
|
100 |
+
# PERSIST_DIR = "/home/docsa/Legal_Position/Save_index"
|
101 |
|
102 |
# Apply nest_asyncio to handle nested async calls
|
103 |
nest_asyncio.apply()
|
|
|
185 |
|
186 |
def initialize_components():
|
187 |
try:
|
188 |
+
# persist_path = Path(PERSIST_DIR)
|
189 |
+
persist_path = Path("local_data")
|
190 |
|
191 |
if not persist_path.exists():
|
192 |
raise FileNotFoundError(f"Directory not found: {persist_path}")
|
|
|
200 |
|
201 |
global retriever_bm25
|
202 |
|
203 |
+
|
204 |
docstore = SimpleDocumentStore.from_persist_path(str(persist_path / "docstore_es_filter.json"))
|
205 |
bm25_retriever = BM25Retriever.from_persist_dir(str(persist_path / "bm25_retriever_es"))
|
206 |
|
207 |
+
# docstore = SimpleDocumentStore.from_persist_path(str(persist_path / "docstore_es_filter.json"))
|
208 |
+
# bm25_retriever = BM25Retriever.from_persist_dir(str(persist_path / "bm25_retriever_es"))
|
209 |
+
|
210 |
retriever_bm25 = QueryFusionRetriever(
|
211 |
[
|
212 |
bm25_retriever,
|
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ llama-index-embeddings-openai
|
|
8 |
llama-index-llms-openai
|
9 |
gradio
|
10 |
beautifulsoup4
|
11 |
-
nest-asyncio
|
|
|
|
8 |
llama-index-llms-openai
|
9 |
gradio
|
10 |
beautifulsoup4
|
11 |
+
nest-asyncio
|
12 |
+
boto3
|