Spaces:
Sleeping
Sleeping
jellychoco
committed on
Commit
·
4244176
1
Parent(s):
6df1587
Add application file
Browse files- Dockerfile +25 -0
- app.py +102 -0
- requirements.txt +7 -0
Dockerfile
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FROM python:3.9-slim

WORKDIR /app

# Install system build tools (needed to compile wheels such as sentencepiece),
# then drop the apt cache to keep the image small.
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install the required Python packages.
# requirements.txt is copied alone first so this layer is cached
# independently of application-code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code.
COPY . .

# Create a non-privileged user and switch to it.
RUN useradd -m -u 1000 user
USER user

# Environment: make user-local executables resolvable.
ENV PATH="/home/user/.local/bin:$PATH"

# Start the server on port 7860 (presumably the hosting platform's expected
# port — confirm against deployment config).
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json  # NOTE(review): appears unused in this file — verify before removing
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Union, Dict, Any
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, MarianMTModel, MarianTokenizer

# Model identifiers
M2M100_MODEL_NAME = "facebook/m2m100_418M"
OPUS_MT_MODEL_NAME = "Helsinki-NLP/opus-mt-en-ko"

# M2M100 (multilingual translation) — loaded once at import time
m2m100_tokenizer = M2M100Tokenizer.from_pretrained(M2M100_MODEL_NAME)
m2m100_model = M2M100ForConditionalGeneration.from_pretrained(M2M100_MODEL_NAME)

# Helsinki-NLP Opus-MT (English <-> Korean only; this instance is en->ko)
opus_tokenizer = MarianTokenizer.from_pretrained(OPUS_MT_MODEL_NAME)
opus_model = MarianMTModel.from_pretrained(OPUS_MT_MODEL_NAME)

# Run on CPU
device = torch.device("cpu")
m2m100_model.to(device)
opus_model.to(device)

# FastAPI app
app = FastAPI()
# Request payload model for POST /translate
class TranslationRequest(BaseModel):
    model: str  # which backend to use: "m2m100" or "opus-mt"
    from_lang: str  # source language code (e.g. "ko", "en", "fr")
    to: str  # target language code (e.g. "ko", "fr")
    data: Dict[str, Any]  # JSON object whose string values get translated
35 |
+
# M2M100 λ²μ ν¨μ (λͺ¨λ μΈμ΄ μ§μ)
|
36 |
+
def translate_m2m100(text: str, src_lang: str, tgt_lang: str) -> str:
|
37 |
+
if not text.strip():
|
38 |
+
return text # λΉ λ¬Έμμ΄μ΄λ©΄ κ·Έλλ‘ λ°ν
|
39 |
+
|
40 |
+
m2m100_tokenizer.src_lang = src_lang
|
41 |
+
encoded_text = m2m100_tokenizer(text, return_tensors="pt").to(device)
|
42 |
+
generated_tokens = m2m100_model.generate(
|
43 |
+
**encoded_text, forced_bos_token_id=m2m100_tokenizer.get_lang_id(tgt_lang)
|
44 |
+
)
|
45 |
+
return m2m100_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
|
46 |
+
|
# Opus-MT translation cache: model name -> (tokenizer, model). Each direction
# is loaded from disk exactly once instead of being re-loaded on every call
# (the original reloaded both tokenizer and model per request). The en->ko
# pair reuses the instances already loaded at import time.
_opus_mt_cache: Dict[str, Any] = {}


# Helsinki-NLP Opus-MT translation (English <-> Korean only)
def translate_opus_mt(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate *text* between English and Korean with an Opus-MT model.

    Empty or whitespace-only input is returned unchanged.
    Raises HTTPException(400) for any pair other than en<->ko.
    """
    if not text.strip():
        return text  # return empty/whitespace-only strings as-is

    if src_lang == "en" and tgt_lang == "ko":
        model_name = "Helsinki-NLP/opus-mt-en-ko"
    elif src_lang == "ko" and tgt_lang == "en":
        model_name = "Helsinki-NLP/opus-mt-ko-en"
    else:
        # NOTE(review): detail text preserved verbatim from source
        # (it appears to be encoding-mangled Korean).
        raise HTTPException(status_code=400, detail="Opus-MTλ μμ΄ β νκ΅μ΄λ§ μ§μν©λλ€.")

    if model_name not in _opus_mt_cache:
        if model_name == OPUS_MT_MODEL_NAME:
            # en->ko was loaded at startup; reuse instead of reloading.
            _opus_mt_cache[model_name] = (opus_tokenizer, opus_model)
        else:
            loaded_tokenizer = MarianTokenizer.from_pretrained(model_name)
            loaded_model = MarianMTModel.from_pretrained(model_name).to(device)
            _opus_mt_cache[model_name] = (loaded_tokenizer, loaded_model)
    tokenizer, model = _opus_mt_cache[model_name]

    encoded_text = tokenizer(text, return_tensors="pt", padding=True).to(device)
    generated_tokens = model.generate(**encoded_text)
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
|
# Recursively translate every string value inside a JSON-like object.
def recursive_translate(json_obj: Union[Dict[str, Any], str], src_lang: str, tgt_lang: str, model_type: str):
    """Walk *json_obj* and translate each string leaf with the chosen model.

    Dicts are rebuilt with translated values (keys untouched); numbers,
    lists, and any other non-string leaves pass through unchanged.
    """
    if isinstance(json_obj, dict):
        # Recurse into every value; keys are never translated.
        return {
            key: recursive_translate(value, src_lang, tgt_lang, model_type)
            for key, value in json_obj.items()
        }
    if isinstance(json_obj, str):
        if model_type == "m2m100":
            return translate_m2m100(json_obj, src_lang, tgt_lang)
        if model_type == "opus-mt":
            return translate_opus_mt(json_obj, src_lang, tgt_lang)
        # Unknown model type on a string leaf: the original implementation
        # fell through its if/elif chain and returned None — preserved.
        return None
    # Numbers, lists, None, etc. are returned untouched.
    return json_obj
|
@app.post("/translate")
async def translate_json(request: TranslationRequest):
    """Translate every string value of the submitted JSON payload."""
    model_type = request.model  # "m2m100" or "opus-mt"
    src_lang = request.from_lang
    tgt_lang = request.to

    # Language codes accepted for M2M100 (a curated subset of the model's
    # full coverage).
    supported_langs = ["ko", "en", "fr", "es", "ja", "zh", "de", "it"]

    # Validate the model/language combination up front.
    # NOTE(review): the detail strings are preserved verbatim from source;
    # they appear to be encoding-mangled Korean.
    if model_type == "m2m100":
        if not (src_lang in supported_langs and tgt_lang in supported_langs):
            raise HTTPException(status_code=400, detail=f"μ§μλμ§ μλ μΈμ΄ μ½λ: {src_lang} β {tgt_lang}")
    elif model_type == "opus-mt":
        if src_lang not in ["en", "ko"] or tgt_lang not in ["en", "ko"]:
            raise HTTPException(status_code=400, detail="Opus-MT λͺ¨λΈμ μμ΄ β νκ΅μ΄ λ²μλ§ μ§μν©λλ€.")
    else:
        raise HTTPException(status_code=400, detail="μ§μλμ§ μλ λͺ¨λΈ μ ν")

    # Walk the JSON object and translate its string leaves.
    return recursive_translate(request.data, src_lang, tgt_lang, model_type)
requirements.txt
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fastapi==0.104.1
|
2 |
+
uvicorn[standard]==0.24.0
|
3 |
+
pydantic==2.4.2
|
4 |
+
python-dotenv==1.0.0
|
5 |
+
transformers==4.35.2
|
6 |
+
sentencepiece==0.1.99
|
7 |
+
torch==2.1.1
|