KazukiNakamae committed on
Commit 11c6e7b · 1 Parent(s): 845471f

Uploaded model and tokenizer files
README.md CHANGED
@@ -1,5 +1,95 @@
  ---
  license: apache-2.0
  base_model:
- - zhihan1996/DNABERT-2-117M
- ---
  ---
  license: apache-2.0
  base_model:
+ - zhihan1996/DNABERT-2-117M
+ tags:
+ - biology
+ - medical
+ ---
+ This is the STL model, one of the models fine-tuned from [zhihan1996/DNABERT-2-117M](https://huggingface.co/zhihan1996/DNABERT-2-117M).
+
+ The STL model predicts RNA off-targets induced by cytosine base editors (CBEs).
+
+ Here is an example of using the model for RNA off-target prediction.
+
+ **pred_rna_offtarget.py:**
+
+ ```python
+ import sys
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ __authors__ = ["Kazuki Nakamae"]
+ __version__ = "1.0.0"
+
+ def pred_rna_offtarget(dna, model_dir):
+     try:
+         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Select the device for the model: fall back to CPU if CUDA is unavailable
+         tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)  # Tokenizer that produces the input format the trained model expects
+         model = AutoModelForSequenceClassification.from_pretrained(model_dir, trust_remote_code=True).to(device)  # Load the trained model for sequence classification
+     except Exception as e:
+         print(f"Error loading model from {model_dir}: {e}")
+         sys.exit(1)
+
+     inputs = tokenizer(dna, return_tensors='pt')  # Split the input sequence into tokens the trained model understands and convert each token to its token ID
+     model.eval()  # Put the model in inference mode (not training mode)
+     with torch.no_grad():  # Disable gradient computation to save memory
+         # Pass the encoded inputs to the model and evaluate
+         outputs = model(
+             inputs["input_ids"].to(device),
+             inputs["attention_mask"].to(device),
+         )
+     # print(outputs.logits)
+     # Example output: tensor([[-1.6488, 1.4636]])
+     # The format is [negative score, positive score].
+     y_preds = np.argmax(outputs.logits.to('cpu').detach().numpy().copy(), axis=1)  # Take the index of the higher score and convert to NumPy
+
+     # Read the label name from the model config
+     def id2label(x):
+         return model.config.id2label[x]
+     y_dash = [id2label(x) for x in y_preds]  # Map the higher-scoring index to its label
+     print(y_dash)
+     # Example output: ['LABEL_1']
+     # LABEL_0: negative / LABEL_1: positive
+     return (dna, y_dash)
+
+ def print_usage():
+     print(f"Usage: {sys.argv[0]} <input DNA sequence> <DNABERT-2 model directory>")
+     print("Options:")
+     print("  -h, --help     Show this help message and exit")
+     print("  -v, --version  Show version information and exit")
+
+ def print_version():
+     print(f"{sys.argv[0]} version {__version__}")
+     print("Authors:", ", ".join(__authors__))
+
+ if __name__ == "__main__":
+     if len(sys.argv) != 3:
+         if len(sys.argv) == 2 and sys.argv[1] in ("-h", "--help"):
+             print_usage()
+             sys.exit(0)
+         elif len(sys.argv) == 2 and sys.argv[1] in ("-v", "--version"):
+             print_version()
+             sys.exit(0)
+         else:
+             print_usage()
+             sys.exit(1)
+
+     dna = sys.argv[1]
+     model_dir = sys.argv[2]
+
+     pred_rna_offtarget(dna, model_dir)
+ ```
+
+ ```bash
+ $ python pred_rna_offtarget.py GGCAGGGCTGGGGAAGCTTACTGTGTCCAAGAGCCTGCTG KazukiNakamae/STLmodel
+ ['LABEL_1']
+ $ python pred_rna_offtarget.py GTCATCTAACAAAAATATTCCGTTGCAGGAAAAGCAAGCT KazukiNakamae/STLmodel
+ ['LABEL_0']
+ ```
+
+ #### Developers of the fine-tuned model
+ - [Takayuki Suzuki](https://github.com/szktkyk)
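
The script above can also be imported and called from Python rather than the command line. A minimal sketch, assuming `pred_rna_offtarget.py` is saved in the working directory and using the repo ID from the shell example (the model is downloaded on first use):

```python
# Usage sketch: import the README's function and call it directly.
from pred_rna_offtarget import pred_rna_offtarget

seq = "GGCAGGGCTGGGGAAGCTTACTGTGTCCAAGAGCCTGCTG"
dna, labels = pred_rna_offtarget(seq, "KazukiNakamae/STLmodel")  # returns (input sequence, predicted labels)
print(dna, labels)  # e.g. ['LABEL_1'] for a positive prediction
```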
config.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_name_or_path": "tmp/DNABERT-2-CBE_Suzuki_v1/",
+   "alibi_starting_size": 512,
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.0,
+   "auto_map": {
+     "AutoConfig": "zhihan1996/DNABERT-2-117M--configuration_bert.BertConfig",
+     "AutoModel": "zhihan1996/DNABERT-2-117M--bert_layers.BertModel",
+     "AutoModelForMaskedLM": "zhihan1996/DNABERT-2-117M--bert_layers.BertForMaskedLM",
+     "AutoModelForSequenceClassification": "zhihan1996/DNABERT-2-117M--bert_layers.BertForSequenceClassification"
+   },
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.29.2",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 4096
+ }
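
The `auto_map` entries above route the `Auto*` classes to DNABERT-2's remote code, which is why the loading calls in the README pass `trust_remote_code=True`. A short sketch for inspecting this config without loading the weights (repo ID assumed as in the shell example above):

```python
from transformers import AutoConfig

# Fetch only the configuration; trust_remote_code resolves the custom BertConfig via auto_map.
config = AutoConfig.from_pretrained("KazukiNakamae/STLmodel", trust_remote_code=True)
print(config.architectures)  # ['BertForSequenceClassification']
print(config.id2label)       # index-to-label mapping used by pred_rna_offtarget()
```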
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:61fd38248ed199c7b809140a8ae0c96267228f6adea1bd73d6145e14dfdafe08
+ size 468326010
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "model_max_length": 10,
+   "pad_token": "[PAD]",
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "[UNK]"
+ }
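
Note that `model_max_length` is 10 here while the example queries are 40 nt; the README's script calls the tokenizer without `truncation=True`, so inputs are encoded in full and this limit does not truncate them. A quick sketch for inspecting the tokenizer (repo ID assumed as above):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("KazukiNakamae/STLmodel", trust_remote_code=True)
print(tok.cls_token, tok.sep_token, tok.pad_token, tok.unk_token)  # special tokens from special_tokens_map.json
enc = tok("GGCAGGGCTGGGGAAGCTTACTGTGTCCAAGAGCCTGCTG")
print(enc["input_ids"])  # BPE token IDs; no truncation is applied by default
```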