lyangas committed
Commit 6304a81 · 1 Parent(s): e5128ee

init commit

Files changed (5)
  1. Dockerfile +13 -0
  2. app.py +67 -0
  3. model_finetuned_clear.pkl +3 -0
  4. required_classes.py +74 -0
  5. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.8
+
+ WORKDIR /code
+
+ COPY ./requirements.txt /code/requirements.txt
+
+ RUN pip install --upgrade -r /code/requirements.txt
+
+ COPY ./model_finetuned_clear.pkl ./model_finetuned_clear.pkl
+ COPY ./required_classes.py ./required_classes.py
+ COPY ./app.py ./app.py
+
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,67 @@
+ print('INFO: import modules')
+ from flask import Flask, request
+ import json
+ import pickle
+ import numpy as np
+
+ from required_classes import BertEmbedder, PredictModel
+
+
+ print('INFO: loading model')
+ try:
+     with open('model_finetuned_clear.pkl', 'rb') as f:
+         model = pickle.load(f)
+     model.batch_size = 1
+     print('INFO: model loaded')
+ except Exception as e:
+     print(f"ERROR: loading model failed with: {str(e)}")
+
+ def classify_code(text, top_n):
+     embed = model._texts2vecs([text])
+     probs = model.classifier_code.predict_proba(embed)
+     best_n = np.flip(np.argsort(probs, axis=1)[0, -top_n:])
+     # cast numpy types to plain Python so Flask can JSON-serialize the response
+     preds = [{'code': str(model.classifier_code.classes_[i]), 'proba': float(probs[0][i])} for i in best_n]
+     return preds
+
+ def classify_group(text, top_n):
+     embed = model._texts2vecs([text])
+     probs = model.classifier_group.predict_proba(embed)
+     best_n = np.flip(np.argsort(probs, axis=1)[0, -top_n:])
+     preds = [{'group': str(model.classifier_group.classes_[i]), 'proba': float(probs[0][i])} for i in best_n]
+     return preds
+
+
+ app = Flask(__name__)
+
+ @app.get("/")
+ def test_get():
+     return {'hello': 'world'}
+
+ @app.route("/test", methods=['POST'])
+ def test():
+     data = request.form
+     # request.form is a MultiDict; convert it so it can be returned as JSON
+     return {'response': data.to_dict()}
+
+ @app.route("/predict", methods=['POST'])
+ def read_root():
+     data = request.form
+     text = str(data['text'])
+     top_n = int(data['top_n'])
+
+     if top_n < 1:
+         return {'error': 'top_n should be greater than 0'}
+     if text.strip() == '':
+         return {'error': 'text is empty'}
+
+     pred_codes = classify_code(text, top_n)
+     pred_groups = classify_group(text, top_n)
+     result = {
+         "icd10":
+             {'result': pred_codes[0]['code'], 'details': pred_codes},
+         "dx_group":
+             {'result': pred_groups[0]['group'], 'details': pred_groups}
+     }
+     return result
+
+ if __name__ == "__main__":
+     app.run(host='0.0.0.0', port=7860)
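The service exposes three routes: GET / as a health check, POST /test, which echoes the submitted form, and POST /predict, which expects the form fields text and top_n and returns the best ICD-10 code and diagnosis group together with the top_n candidates and their probabilities. A sketch of calling /predict from Python (host and port follow the app.run call above; the requests package and the example text are assumptions, not part of this commit):

import requests

payload = {
    'text': 'patient presents with chest pain and shortness of breath',  # arbitrary example text
    'top_n': 3,
}
resp = requests.post('http://localhost:7860/predict', data=payload, timeout=60)

# expected shape:
# {'icd10': {'result': '<code>', 'details': [...]}, 'dx_group': {'result': '<group>', 'details': [...]}}
print(resp.json())
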
model_finetuned_clear.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c40076019c4b4767021bf208200a8104f0910669d0b56952e6b2eb62b1539d3
+ size 434856921
required_classes.py ADDED
@@ -0,0 +1,74 @@
+ import numpy as np
+ from typing import List
+
+ import torch
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, BertForSequenceClassification
+
+
+ class BertEmbedder:
+     def __init__(self, model_path: str, cut_head: bool = False):
+         """
+         cut_head = True if the model has a classification head;
+         the head is dropped and only the BERT encoder is kept.
+         """
+         self.embedder = BertForSequenceClassification.from_pretrained(model_path)
+         self.max_length = self.embedder.config.max_position_embeddings
+         self.tokenizer = AutoTokenizer.from_pretrained(model_path, max_length=self.max_length)
+
+         if cut_head:
+             self.embedder = self.embedder.bert
+
+         self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+         self.embedder.to(self.device)
+
+     def __call__(self, text: str):
+         encoded_input = self.tokenizer(text,
+                                        return_tensors='pt',
+                                        max_length=self.max_length,
+                                        padding=True,
+                                        truncation=True).to(self.device)
+         model_output = self.embedder(**encoded_input)
+         text_embed = model_output.pooler_output[0].cpu()
+         return text_embed
+
+     def batch_predict(self, texts: List[str]):
+         encoded_input = self.tokenizer(texts,
+                                        return_tensors='pt',
+                                        max_length=self.max_length,
+                                        padding=True,
+                                        truncation=True).to(self.device)
+         model_output = self.embedder(**encoded_input)
+         texts_embeds = model_output.pooler_output.cpu()
+         return texts_embeds
+
+
+ class PredictModel:
+     def __init__(self, embedder, classifier, batch_size=8):
+         self.batch_size = batch_size
+         self.embedder = embedder
+         self.classifier = classifier
+
+     def _texts2vecs(self, texts, log=False):
+         embeds = []
+         # split into batches; keep at least one batch when len(texts) < batch_size
+         batches_texts = np.array_split(texts, max(1, len(texts) // self.batch_size))
+         if log:
+             iterator = tqdm(batches_texts)
+         else:
+             iterator = batches_texts
+         for batch_texts in iterator:
+             batch_texts = batch_texts.tolist()
+             embeds += self.embedder.batch_predict(batch_texts).tolist()
+         embeds = np.array(embeds)
+         return embeds
+
+     def fit(self, texts: List[str], labels: List[str], log: bool = False):
+         if log:
+             print('Start text2vec transform')
+         embeds = self._texts2vecs(texts, log)
+         if log:
+             print('Start classifier fitting')
+         self.classifier.fit(embeds, labels)
+
+     def predict(self, texts: List[str], log: bool = False):
+         if log:
+             print('Start text2vec transform')
+         embeds = self._texts2vecs(texts, log)
+         if log:
+             print('Start classifier prediction')
+         prediction = self.classifier.predict(embeds)
+         return prediction
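BertEmbedder turns texts into pooled BERT sentence embeddings and PredictModel feeds those embeddings to a scikit-learn-style classifier in batches. A minimal sketch of assembling and training such a pair from scratch (the checkpoint name, the LogisticRegression classifier, and the toy texts/labels are assumptions for illustration; the shipped model_finetuned_clear.pkl was trained elsewhere):

from sklearn.linear_model import LogisticRegression

from required_classes import BertEmbedder, PredictModel

# cut_head=True keeps only the BERT encoder so that pooler_output is available
embedder = BertEmbedder('bert-base-uncased', cut_head=True)
model = PredictModel(embedder, LogisticRegression(max_iter=1000), batch_size=8)

texts = ['chest pain on exertion', 'persistent dry cough', 'fractured left wrist']
labels = ['I20', 'R05', 'S62']  # toy ICD-10-style labels
model.fit(texts, labels, log=True)

print(model.predict(['sharp chest pain when climbing stairs']))
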
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ numpy==1.22.4
+ torch==2.0.1
+ scikit-learn==1.2.2
+ transformers==4.29.2
+ flask==2.0.3