jellychoco committed on
Commit
4244176
·
1 Parent(s): 6df1587

Add application file

Browse files
Files changed (3) hide show
  1. Dockerfile +25 -0
  2. app.py +102 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9-slim

# Create the unprivileged runtime user first so that the later COPY step can
# hand ownership of the application files to it.
RUN useradd -m -u 1000 user

WORKDIR /app

# Install system packages. --no-install-recommends avoids pulling optional
# extras, and the apt lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install the required Python packages. requirements.txt is copied on its own
# so this layer is only rebuilt when the dependency list changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code, owned by the runtime user rather than root.
COPY --chown=user . .

# Switch to the unprivileged user.
USER user

# Set environment variables.
ENV PATH="/home/user/.local/bin:$PATH"

# Start the server.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import torch
3
+ from fastapi import FastAPI, HTTPException
4
+ from pydantic import BaseModel
5
+ from typing import Union, Dict, Any
6
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer, MarianMTModel, MarianTokenizer
7
+
8
# Checkpoint names of the two translation back-ends.
M2M100_MODEL_NAME = "facebook/m2m100_418M"
OPUS_MT_MODEL_NAME = "Helsinki-NLP/opus-mt-en-ko"

# Run on the CPU.
device = torch.device("cpu")

# M2M100 (multilingual translation); loaded once when the module is imported.
m2m100_tokenizer = M2M100Tokenizer.from_pretrained(M2M100_MODEL_NAME)
m2m100_model = M2M100ForConditionalGeneration.from_pretrained(M2M100_MODEL_NAME).to(device)

# Helsinki-NLP Opus-MT (English -> Korean checkpoint); loaded once as well.
opus_tokenizer = MarianTokenizer.from_pretrained(OPUS_MT_MODEL_NAME)
opus_model = MarianMTModel.from_pretrained(OPUS_MT_MODEL_NAME).to(device)

# FastAPI application.
app = FastAPI()
27
+
28
# Request payload model for the translation endpoint.
class TranslationRequest(BaseModel):
    # Model to use ("m2m100" or "opus-mt").
    model: str
    # Input language (e.g. "ko", "en", "fr").
    from_lang: str
    # Output language (e.g. "ko", "fr").
    to: str
    # JSON object to translate.
    data: Dict[str, Any]
34
+
35
# M2M100 translation function (supports all languages).
def translate_m2m100(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` from ``src_lang`` to ``tgt_lang`` with the M2M100 model.

    Empty or whitespace-only input is returned unchanged without calling the
    model. Otherwise the first decoded sequence produced by the model is
    returned.
    """
    if text.strip():
        # The tokenizer needs to know the source language before encoding.
        m2m100_tokenizer.src_lang = src_lang
        model_inputs = m2m100_tokenizer(text, return_tensors="pt").to(device)

        # Forcing the first generated token to the target-language id selects
        # the output language.
        target_lang_id = m2m100_tokenizer.get_lang_id(tgt_lang)
        output_ids = m2m100_model.generate(**model_inputs, forced_bos_token_id=target_lang_id)

        decoded = m2m100_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return decoded[0]

    return text
46
+
47
# Helsinki-NLP Opus-MT translation function (English <-> Korean only).

# Checkpoint to use for each supported (source, target) language pair.
_OPUS_MT_CHECKPOINTS = {
    ("en", "ko"): "Helsinki-NLP/opus-mt-en-ko",
    ("ko", "en"): "Helsinki-NLP/opus-mt-ko-en",
}

# Loaded (tokenizer, model) pairs keyed by checkpoint name, filled on first use.
_OPUS_MT_LOADED = {}


def _load_opus_mt(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it at most once."""
    pair = _OPUS_MT_LOADED.get(model_name)
    if pair is None:
        if model_name == OPUS_MT_MODEL_NAME:
            # This checkpoint is already loaded at module import time; reuse it.
            pair = (opus_tokenizer, opus_model)
        else:
            pair = (
                MarianTokenizer.from_pretrained(model_name),
                MarianMTModel.from_pretrained(model_name).to(device),
            )
        _OPUS_MT_LOADED[model_name] = pair
    return pair


def translate_opus_mt(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate ``text`` between English and Korean with an Opus-MT model.

    Empty or whitespace-only input is returned unchanged, before the language
    pair is checked. The tokenizer and model for each direction are loaded
    once and reused on later calls instead of being re-created per string.

    Raises:
        HTTPException: status 400 when the language pair is neither
            "en" -> "ko" nor "ko" -> "en".
    """
    if not text.strip():
        return text

    model_name = _OPUS_MT_CHECKPOINTS.get((src_lang, tgt_lang))
    if model_name is None:
        raise HTTPException(status_code=400, detail="Opus-MT는 영어 ↔ 한국어만 지원합니다.")

    tokenizer, model = _load_opus_mt(model_name)

    encoded_text = tokenizer(text, return_tensors="pt", padding=True).to(device)
    generated_tokens = model.generate(**encoded_text)
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
65
+
66
# Recursive JSON translation function.
def recursive_translate(json_obj: Union[Dict[str, Any], str], src_lang: str, tgt_lang: str, model_type: str):
    """Translate every string value reachable through nested dictionaries.

    Strings are translated with the back-end named by ``model_type``
    ("m2m100" or "opus-mt"). Dictionaries are rebuilt with the same keys and
    recursively translated values. Any other value (numbers, lists, ``None``,
    ...) is returned unchanged.

    Raises:
        ValueError: if a string has to be translated and ``model_type`` is not
            a known back-end. Previously such a string was silently replaced
            by ``None``.
    """
    if isinstance(json_obj, str):
        if model_type == "m2m100":
            return translate_m2m100(json_obj, src_lang, tgt_lang)
        if model_type == "opus-mt":
            return translate_opus_mt(json_obj, src_lang, tgt_lang)
        raise ValueError(f"Unsupported model type: {model_type!r}")

    if isinstance(json_obj, dict):
        return {
            key: recursive_translate(value, src_lang, tgt_lang, model_type)
            for key, value in json_obj.items()
        }

    # Numbers, lists, etc. are returned as-is without translation.
    return json_obj
77
+
78
# Language codes accepted for the M2M100 back-end.
_M2M100_LANGS = frozenset({"ko", "en", "fr", "es", "ja", "zh", "de", "it"})

# Translation directions accepted for the Opus-MT back-end.
_OPUS_MT_DIRECTIONS = frozenset({("en", "ko"), ("ko", "en")})


@app.post("/translate")
async def translate_json(request: TranslationRequest):
    """Translate the string values of a JSON object.

    The request selects a back-end ("m2m100" or "opus-mt") and a language
    pair. The request is validated first, then every string found in
    ``request.data`` is translated and the resulting object is returned.

    Raises:
        HTTPException: status 400 for an unknown model, for an M2M100 language
            code outside the accepted list, or for an Opus-MT pair other than
            en -> ko / ko -> en.
    """
    model_type = request.model  # "m2m100" or "opus-mt"
    src_lang = request.from_lang
    tgt_lang = request.to
    input_data = request.data

    # Validate the request against the selected model.
    if model_type == "m2m100":
        if src_lang not in _M2M100_LANGS or tgt_lang not in _M2M100_LANGS:
            raise HTTPException(status_code=400, detail=f"지원되지 않는 언어 코드: {src_lang} → {tgt_lang}")
    elif model_type == "opus-mt":
        # Same-language pairs (en -> en, ko -> ko) are rejected here as well;
        # the Opus-MT translator has no checkpoint for them and would fail
        # later, once per string.
        if (src_lang, tgt_lang) not in _OPUS_MT_DIRECTIONS:
            raise HTTPException(status_code=400, detail="Opus-MT 모델은 영어 ↔ 한국어 번역만 지원합니다.")
    else:
        raise HTTPException(status_code=400, detail="지원되지 않는 모델 선택")

    # NOTE: the translation runs synchronously inside this coroutine, so the
    # event loop is blocked until the whole object has been translated.
    translated_data = recursive_translate(input_data, src_lang, tgt_lang, model_type)

    return translated_data
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi==0.104.1
2
+ uvicorn[standard]==0.24.0
3
+ pydantic==2.4.2
4
+ python-dotenv==1.0.0
5
+ transformers==4.35.2
6
+ sentencepiece==0.1.99
7
+ torch==2.1.1