Spaces:
Runtime error
Runtime error
Update src/main.py
Browse files- src/main.py +7 -1
src/main.py
CHANGED
@@ -21,12 +21,18 @@ from src.models import chunk_config, embed_config, WebhookPayload
|
|
21 |
logging.basicConfig(level=logging.INFO)
|
22 |
logger = logging.getLogger(__name__)
|
23 |
|
|
|
24 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
25 |
|
|
|
26 |
TEI_URL = os.getenv("TEI_URL")
|
|
|
27 |
CHUNKED_DS_NAME = os.getenv("CHUNKED_DS_NAME")
|
|
|
28 |
EMBED_DS_NAME = os.getenv("EMBED_DS_NAME")
|
|
|
29 |
INPUT_SPLITS = os.getenv("INPUT_SPLITS")
|
|
|
30 |
INPUT_TEXT_COL = os.getenv("INPUT_TEXT_COL")
|
31 |
|
32 |
INPUT_SPLITS = [spl.strip() for spl in INPUT_SPLITS.split(",") if spl]
|
@@ -183,7 +189,7 @@ def wake_up_endpoint(url):
|
|
183 |
def embed_dataset(ds_name):
|
184 |
logger.info("Update detected, embedding is scheduled")
|
185 |
wake_up_endpoint(TEI_URL)
|
186 |
-
input_ds = load_dataset(ds_name, split="
|
187 |
with tempfile.NamedTemporaryFile(mode="a", suffix=".jsonl") as temp_file:
|
188 |
asyncio.run(embed(input_ds, temp_file))
|
189 |
|
|
|
21 |
logging.basicConfig(level=logging.INFO)
|
22 |
logger = logging.getLogger(__name__)
|
23 |
|
24 |
+
# you token from Settings
|
25 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
26 |
|
27 |
+
# URL of TEI endpoint
|
28 |
TEI_URL = os.getenv("TEI_URL")
|
29 |
+
# name of chunked dataset
|
30 |
CHUNKED_DS_NAME = os.getenv("CHUNKED_DS_NAME")
|
31 |
+
# name of embeddings dataset
|
32 |
EMBED_DS_NAME = os.getenv("EMBED_DS_NAME")
|
33 |
+
# splits of input dataset to process, comma separated
|
34 |
INPUT_SPLITS = os.getenv("INPUT_SPLITS")
|
35 |
+
# name of column to load from input dataset
|
36 |
INPUT_TEXT_COL = os.getenv("INPUT_TEXT_COL")
|
37 |
|
38 |
INPUT_SPLITS = [spl.strip() for spl in INPUT_SPLITS.split(",") if spl]
|
|
|
189 |
def embed_dataset(ds_name):
|
190 |
logger.info("Update detected, embedding is scheduled")
|
191 |
wake_up_endpoint(TEI_URL)
|
192 |
+
input_ds = load_dataset(ds_name, split="train")
|
193 |
with tempfile.NamedTemporaryFile(mode="a", suffix=".jsonl") as temp_file:
|
194 |
asyncio.run(embed(input_ds, temp_file))
|
195 |
|