bite-the-byte
/

byt5-small-deASCIIfy-TR

Token Classification

PEFT

Safetensors

Turkish

Model card Files Files and versions Community

emircanerol committed on May 10, 2024

Commit

0d155e5

verified ·

1 Parent(s): 67e22a8

Update README.md

Browse files

Files changed (1) hide show

README.md +21 -41

README.md CHANGED Viewed

@@ -9,28 +9,14 @@ pipeline_tag: token-classification
 ---
 ```python
-from peft import PeftModel, prepare_model_for_kbit_training
-from transformers import T5ForTokenClassification, BitsAndBytesConfig
-import torch
-model_id = "google/byt5-small"
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-)
-model = T5ForTokenClassification.from_pretrained(model_id,
-                                                num_labels=2,
-                                                torch_dtype=torch.bfloat16,
-                                                quantization_config=bnb_config,
-                                                device_map="auto",)
-model = prepare_model_for_kbit_training(model)
-model = PeftModel.from_pretrained(model, 'bite-the-byte/byt5-small-deASCIIfy-TR')
-def test_mask(data):
     """
     Masks the padded tokens in the input.
     Args:
@@ -39,21 +25,16 @@ def test_mask(data):
         dataset (list): List of dictionaries.
     """
-    dataset = list()
-    for sample in data:
-        new_sample = dict()
-        input_tokens = [i + 3 for i in sample.encode('utf-8')]
-        input_tokens.append(0) # eos token
-        new_sample['input_ids'] = torch.tensor([input_tokens], dtype=torch.int64)
-        # Create attention mask
-        attention_mask = [1] * len(input_tokens)  # Attend to all tokens
-        new_sample['attention_mask'] = torch.tensor([attention_mask], dtype=torch.int64)
-        dataset.append(new_sample)
-    return dataset
 def rewrite(model, data):
     """
@@ -66,24 +47,23 @@ def rewrite(model, data):
     """
     with torch.no_grad():
-        data = {k: v.to(model.device) for k, v in data.items()}
-        pred = torch.argmax(model(**data).logits, dim=2)
     output = list() # save the indices of the characters as list of integers
     # Conversion table for Turkish characters {100: [300, 350], ...}
     en2tr = {en: tr for tr, en in zip(list(map(list, map(str.encode, list('ÜİĞŞÇÖüığşçö')))), list(map(ord, list('UIGSCOuigsco'))))}
-    for inp, lab in zip((data['input_ids'] - 3)[0].tolist(), pred[0].tolist()):
         if lab and inp in en2tr:
             # if the model predicts a diacritic, replace it with the corresponding Turkish character
             output.extend(en2tr[inp])
         elif inp >= 0: output.append(inp)
     return bytes(output).decode()
-def try_it(text, model):#=model):
-    sample = test_mask([text])
-    return rewrite(model, sample[0])
 try_it('Cekoslovakyalilastiramadiklarimizdan misiniz?', model)
 ```

 ---
 ```python
+from peft import PeftModel, PeftConfig
+from transformers import AutoModelForTokenClassification
+config = PeftConfig.from_pretrained("bite-the-byte/byt5-small-deASCIIfy-TR")
+model = AutoModelForTokenClassification.from_pretrained("google/byt5-small")
+model = PeftModel.from_pretrained(model, "bite-the-byte/byt5-small-deASCIIfy-TR")
+def test_mask(device, sample):
     """
     Masks the padded tokens in the input.
     Args:
         dataset (list): List of dictionaries.
     """
+    tokens = dict()
+    input_tokens = [i + 3 for i in sample.encode('utf-8')]
+    input_tokens.append(0) # eos token
+    tokens['input_ids'] = torch.tensor([input_tokens], dtype=torch.int64, device=device)
+    # Create attention mask
+    tokens['attention_mask'] = torch.ones_like(tokens['input_ids'], dtype=torch.int64, device=device)
+    return tokens
 def rewrite(model, data):
     """
     """
     with torch.no_grad():
+        pred = torch.argmax(model(**data).logits, dim=2).squeeze(0)
     output = list() # save the indices of the characters as list of integers
     # Conversion table for Turkish characters {100: [300, 350], ...}
     en2tr = {en: tr for tr, en in zip(list(map(list, map(str.encode, list('ÜİĞŞÇÖüığşçö')))), list(map(ord, list('UIGSCOuigsco'))))}
+    for inp, lab in zip((data['input_ids'].squeeze(0) - 3).tolist(), pred.tolist()):
         if lab and inp in en2tr:
             # if the model predicts a diacritic, replace it with the corresponding Turkish character
             output.extend(en2tr[inp])
         elif inp >= 0: output.append(inp)
     return bytes(output).decode()
+def try_it(text, model):
+    sample = test_mask(model.device, text)
+    return rewrite(model, sample)
 try_it('Cekoslovakyalilastiramadiklarimizdan misiniz?', model)
 ```