|
from gevent import pywsgi |
|
import dotenv |
|
dotenv.load_dotenv(override=True) |
|
|
|
import sys |
|
import time |
|
import argparse |
|
import uvicorn |
|
from typing import Union |
|
from pydantic import BaseModel |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
import torch |
|
import openedai |
|
import numpy as np |
|
import onnxruntime as ort |
|
import asyncio |
|
from optimum.bettertransformer import BetterTransformer |
|
|
|
# Module-level placeholder; nothing in the visible code assigns or reads it
# after this point.
moderation = None

# Application object provided by the project's `openedai` helper module. The
# moderation route and the model names are attached to it further down.
app = openedai.OpenAIStub()
|
# Inference is pinned to the CPU. The previous expression selected "cpu" in
# both branches of a CUDA availability check, so the check was dead code that
# read like a typo. Switching to "cuda" here would NOT be a safe fix: the model
# is dynamically quantized to int8 after loading and is never moved to another
# device, so inputs must stay on the CPU alongside it.
device = "cpu"
|
|
|
|
|
# Moderation categories. The position of a name in this list is the index of
# its score in the classifier output (the request handler pairs them by index).
labels = [
    'hate',
    'hate_threatening',
    'harassment',
    'harassment_threatening',
    'self_harm',
    'self_harm_intent',
    'self_harm_instructions',
    'sexual',
    'sexual_minors',
    'violence',
    'violence_graphic',
]

# Lookup tables in both directions, handed to the model config at load time.
label2id = {name: index for index, name in enumerate(labels)}
id2label = dict(enumerate(labels))
|
# Local checkpoint directory; both the tokenizer and the classifier weights
# are loaded from it.
model_name = "/root/autodl-tmp/duanyu027/moderation_0628"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Multi-label head: one independent score per entry in `labels`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

# Replace the Linear layers with dynamically quantized int8 versions.
model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8,
)

# Restrict PyTorch to a single intra-op thread.
torch.set_num_threads(1)
|
# Request body for POST /v1/moderations.
# Documented with comments rather than a class docstring so that the schema
# pydantic generates for this model is left exactly as it was.
class ModerationsRequest(BaseModel):
    # Model name supplied by the client. The visible handler does not read it.
    model: str = "text-moderation-latest"
    # A single text, or a list of texts, to classify.
    input: Union[str, list[str]]
|
|
|
# Per-category decision thresholds: a category is reported as detected when
# its score is strictly greater than the value listed here. Built once at
# import time instead of on every request.
_MODERATION_THRESHOLDS = {
    "sexual": 0.1,
    "hate": 0.25,
    "harassment": 0.5,
    "self_harm": 0.25,
    "sexual_minors": 0.5,
    "hate_threatening": 0.2,
    "violence_graphic": 0.25,
    "self_harm_intent": 0.2,
    "self_harm_instructions": 0.25,
    "harassment_threatening": 0.1,
    "violence": 0.25,
}


@app.post("/v1/moderations")
async def moderations(request: ModerationsRequest):
    """Score each input text against every moderation category.

    Args:
        request: Parsed request body. ``request.input`` may be one string or
            a list of strings; a single string is treated as a one-item list.

    Returns:
        A dict with an ``id``, a fixed ``model`` name and a ``results`` list
        holding one entry per input text, in input order. Each entry has
        ``flagged`` (True when any category is detected), ``categories``
        (category name -> bool) and ``category_scores`` (category name ->
        float score).
    """
    results = {
        # time.time_ns() gives an exact integer nanosecond timestamp; the
        # float product time.time() * 1e9 is too large to be represented
        # exactly and loses its low-order digits.
        "id": f"modr-{time.time_ns()}",
        "model": "text-moderation-005",
        "results": [],
    }

    # Normalise to a list without mutating the request object.
    texts = [request.input] if isinstance(request.input, str) else request.input

    for text in texts:
        predictions = await predict(text, model, tokenizer)
        # Row 0 is the only row: predict() scores one text per call. Scores
        # are paired with category names by position.
        category_scores = dict(zip(labels, predictions[0].tolist()))
        detect = {
            name: score > _MODERATION_THRESHOLDS[name]
            for name, score in category_scores.items()
        }

        results['results'].append({
            'flagged': any(detect.values()),
            'categories': detect,
            'category_scores': category_scores,
        })

    return results
|
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of *x*.

    Works on NumPy scalars and arrays (element-wise) as well as plain Python
    numbers.

    The value is computed as ``exp(-log(1 + exp(-x)))`` through
    ``np.logaddexp``, which never exponentiates a large positive number. The
    direct formula overflows in ``np.exp(-x)`` for large negative inputs and
    emits a RuntimeWarning; this form returns the same limits (0.0 and 1.0)
    without overflowing.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0.0, -x))
|
|
|
def parse_args(argv):
    """Parse the server's command-line options.

    Args:
        argv: Argument strings, without the program name.

    Returns:
        An ``argparse.Namespace`` with ``host`` (str, default ``0.0.0.0``),
        ``port`` (int, default 5002) and ``test_load`` (bool, default False).
    """
    cli = argparse.ArgumentParser(description='Moderation API')
    option_table = (
        ('--host', {'type': str, 'default': '0.0.0.0'}),
        ('--port', {'type': int, 'default': 5002}),
        ('--test-load', {'action': 'store_true'}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)
|
|
|
async def predict(text, model, tokenizer):
    """Return the per-category sigmoid scores for one text.

    Args:
        text: The string to classify.
        model: Sequence-classification model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with
            ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A tensor holding ``torch.sigmoid`` of the model logits. Callers index
        row 0 to get the scores for ``text``.
    """
    # truncation=True caps the sequence at the tokenizer's configured maximum
    # length, so an over-long input is shortened instead of failing inside
    # the model.
    encoding = tokenizer(text, return_tensors='pt', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    def _predict():
        # Inference only: no autograd bookkeeping needed.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            return torch.sigmoid(outputs.logits)

    # Run the blocking forward pass in the default executor so the event
    # loop is not blocked while the model runs.
    loop = asyncio.get_running_loop()
    predictions = await loop.run_in_executor(None, _predict)

    # Releasing cached GPU memory only makes sense when the inputs actually
    # lived on a CUDA device.
    if input_ids.is_cuda:
        torch.cuda.empty_cache()

    return predictions
|
|
|
if __name__ == "__main__":
    # Script entry point: parse options, announce the bind address on
    # stderr, register the model names, then serve.
    args = parse_args(sys.argv[1:])

    print(f'Starting moderations[{device}] API on {args.host}:{args.port}', file=sys.stderr)

    # NOTE(review): these names use "moderations" (plural) while the request
    # default and the response use "text-moderation-..." - confirm which
    # spelling is intended before changing either.
    for registration in (
        ('text-moderations-latest', 'text-moderations-stable'),
        ('text-moderations-005', 'text-moderations-ifmain'),
    ):
        app.register_model(*registration)

    # With --test-load the script stops here, after the module-level model
    # load has completed, without starting the server.
    if not args.test_load:
        uvicorn.run(app, host=args.host, port=args.port)
|
|