lfoppiano committed
Commit f59f4d6 · 1 Parent(s): 9897f0e

add qwen inference for modal, update the viewer

document_qa/deployment/{modal_inference.py → modal_inference_phi.py} RENAMED
File without changes
document_qa/deployment/modal_inference_qwen.py ADDED
@@ -0,0 +1,71 @@
+import os
+
+import modal
+
+vllm_image = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "vllm",
+        "transformers>=4.51.0",
+        "huggingface_hub[hf_transfer]>=0.26.2",
+        "flashinfer-python==0.2.0.post2",  # pinning, very unstable
+        extra_index_url="https://flashinfer.ai/whl/cu124/torch2.5",
+    )
+    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
+)
+
+MODELS_DIR = "/llamas"
+MODEL_NAME = "Qwen/Qwen3-0.6B"
+MODEL_REVISION = "e6de91484c29aa9480d55605af694f39b081c455"
+
+hf_cache_vol = modal.Volume.from_name("huggingface-cache", create_if_missing=True)
+vllm_cache_vol = modal.Volume.from_name("vllm-cache", create_if_missing=True)
+
+
+app = modal.App("gwen-0.6b-qa-vllm")
+
+N_GPU = 1
+MINUTES = 60  # seconds
+VLLM_PORT = 8000
+
+
+@app.function(
+    image=vllm_image,
+    # gpu=f"L40S:{N_GPU}",
+    gpu=f"A10G:{N_GPU}",
+    # how long should we stay up with no requests?
+    scaledown_window=5 * MINUTES,
+    volumes={
+        "/root/.cache/huggingface": hf_cache_vol,
+        "/root/.cache/vllm": vllm_cache_vol,
+    },
+    secrets=[modal.Secret.from_name("document-qa-api-key")]
+)
+@modal.concurrent(
+    max_inputs=5
+)  # how many requests can one replica handle? tune carefully!
+@modal.web_server(port=VLLM_PORT, startup_timeout=5 * MINUTES)
+def serve():
+    import subprocess
+
+    cmd = [
+        "vllm",
+        "serve",
+        "--uvicorn-log-level=info",
+        MODEL_NAME,
+        "--revision",
+        MODEL_REVISION,
+        "--enable-reasoning",
+        "--reasoning-parser",
+        "deepseek_r1",
+        "--max-model-len",
+        "32768",
+        "--host",
+        "0.0.0.0",
+        "--port",
+        str(VLLM_PORT),
+        "--api-key",
+        os.environ["API_KEY"],
+    ]
+
+    subprocess.Popen(" ".join(cmd), shell=True)
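
Once deployed, serve() exposes vLLM's OpenAI-compatible API behind Modal's web endpoint, authenticated with the API_KEY drawn from the document-qa-api-key secret. A minimal client sketch; the base_url below is a placeholder, since the real hostname is derived from the Modal workspace and app name:

from openai import OpenAI

# Placeholder URL: Modal derives the actual hostname from the workspace
# and app name. The api_key must match the document-qa-api-key secret.
client = OpenAI(
    base_url="https://<workspace>--gwen-0-6b-qa-vllm-serve.modal.run/v1",
    api_key="<API_KEY>",
)

response = client.chat.completions.create(
    model="Qwen/Qwen3-0.6B",
    messages=[{"role": "user", "content": "What does this paper conclude?"}],
    max_tokens=256,
)
print(response.choices[0].message.content)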
requirements.txt CHANGED
@@ -29,6 +29,6 @@ typing-inspect==0.9.0
 typing_extensions==4.12.2
 pydantic==2.10.6
 sentence-transformers==2.6.1
-streamlit-pdf-viewer==0.0.22rc0
+streamlit-pdf-viewer==0.0.25
 umap-learn==0.5.6
 plotly==5.20.0
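
The streamlit-pdf-viewer pin moves from the 0.0.22rc0 pre-release to the 0.0.25 release. A minimal usage sketch of the package (the file path and the empty annotation list are illustrative):

from streamlit_pdf_viewer import pdf_viewer

# Render a PDF inline in the Streamlit page; annotations takes a list of
# coordinate dicts for highlighting regions (empty here for illustration).
pdf_viewer("document.pdf", annotations=[])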
streamlit_app.py CHANGED
@@ -19,7 +19,8 @@ from document_qa.document_qa_engine import DocumentQAEngine, DataStorage
 from document_qa.grobid_processors import GrobidAggregationProcessor, decorate_text_with_annotations
 
 API_MODELS = {
-    "microsoft/Phi-4-mini-instruct": os.environ["LM_URL"]
+    "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
+    "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"]
 }
 
 API_EMBEDDINGS = {
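
With two entries in API_MODELS, the app now requires both PHI_URL and QWEN_URL in the environment and can let the user pick a backend at runtime. A plausible consumption sketch; the selectbox wiring below is illustrative, not taken from the repository:

import os

import streamlit as st

# Illustrative: mirrors the new API_MODELS mapping and resolves the chosen
# model name to its inference endpoint URL.
API_MODELS = {
    "microsoft/Phi-4-mini-instruct": os.environ["PHI_URL"],
    "Qwen/Qwen3-0.6B": os.environ["QWEN_URL"],
}

model_name = st.selectbox("Model", list(API_MODELS.keys()))
api_base = API_MODELS[model_name]  # base URL passed to the LLM client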