Upload 11 files

Files changed (11) hide show

README.md CHANGED Viewed

@@ -1,3 +1,60 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+language:
+- en
+base_model:
+- FacebookAI/roberta-base
+pipeline_tag: text-classification
+library_name: transformers
+---
+# RoBERTa-base AI Text Detector
+Fine-tuned RoBERTa-base model for detecting AI-generated English text.
+See [FakespotAILabs/ApolloDFT](https://github.com/FakespotAILabs/ApolloDFT) for more details and a technical report on the model and the experiments we conducted.
+## How to use
+You can use this model directly with a pipeline.
+For better performance, apply the `clean_text` function from [utils.py](utils.py) to each input text before passing it to the pipeline.
+```python
+from transformers import pipeline
+from utils import clean_text
+classifier = pipeline(
+    "text-classification",
+    model="fakespot-ai/roberta-base-ai-text-detection-v1"
+)
+# single text
+text = "text 1"
+classifier(clean_text(text))
+[
+    {
+        'label': str,
+        'score': float
+    }
+]
+# list of texts
+texts = ["text 1", "text 2"]
+classifier([clean_text(t) for t in texts])
+[
+    {
+        'label': str,
+        'score': float
+    },
+    {
+        'label': str,
+        'score': float
+    }
+]
+```
+## Disclaimer
+- The model's score represents an estimation of the likelihood of the input text being AI-generated or human-written, rather than indicating the proportion of the text that is AI-generated or human-written.
+- The accuracy and performance of the model generally improve with longer text inputs.

config.json ADDED Viewed

+{
+  "_name_or_path": "FacebookAI/roberta-base",
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "Human",
+    "1": "AI"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "AI": 1,
+    "Human": 0
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float32",
+  "transformers_version": "4.46.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model.safetensors ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:adc87ded15a8fbea26dec51a747adfd59ad1e9021073287e7e5e29209564e56c
+size 498612824

pytorch_model.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8624aaf898074697c13a698ddcfe404a2b5fa5f275f6412343f2b0f946e4b7d
+size 498669998

special_tokens_map.json ADDED Viewed

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50264": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

training_args.bin ADDED Viewed

+version https://git-lfs.github.com/spec/v1
+oid sha256:22a1d8262404d459ac805c652c9b1c7b4f04481defe45a801fc449884f996ef3
+size 5240

utils.py ADDED Viewed

+import re
+from html import unescape
def clean_text(t: str) -> str:
    """Strip Markdown from *t* and flatten it to single-spaced text.

    The text is first passed through ``clean_markdown``.  Newlines, tabs,
    carriage returns and the literal two-character sequence ``^M`` are then
    replaced by spaces, runs of spaces are collapsed to one, and a space
    directly in front of a comma is dropped.

    Leading and trailing spaces are not stripped.

    Args:
        t: Raw input text.

    Returns:
        The cleaned text.
    """
    t = clean_markdown(t)
    # "^M" is the literal caret + "M" pair, not a control character; the real
    # carriage return is handled by "\r".  Presumably it targets caret-notation
    # residue in exported text -- TODO confirm against the data source.
    for token in ("\n", "\t", "^M", "\r"):
        t = t.replace(token, " ")
    # Collapse runs of spaces *before* fixing commas.  Doing it the other way
    # round turned "a  ,b" into "a ,b", leaving a space before the comma.
    t = re.sub(r" +", " ", t)
    t = t.replace(" ,", ",")
    return t
# A line made only of three or more '-', '*' or '_' characters.
_HORIZONTAL_RULE = re.compile(r'^\s*[-*_]{3,}\s*$', re.MULTILINE)

# Ordered (pattern, replacement) rules applied by clean_markdown().
# Order matters: code is dropped before anything inside it can be mistaken
# for markup, images go before links (an image is a link with a leading '!'),
# and bold goes before italic so "**x**" is not read as two italic markers.
#
# NOTE(review): the heading, italic, list-marker and horizontal-rule rules
# differ from the earlier patterns for some inputs.  If a model was trained on
# text cleaned with the earlier patterns, re-validate it against this output.
_MARKDOWN_RULES = (
    # Fenced code blocks, contents included (may span several lines).
    (re.compile(r'```.*?```', re.DOTALL), ''),
    # Inline code spans, contents included.
    (re.compile(r'`[^`]*`'), ''),
    # Images: alt text and target are both dropped.
    (re.compile(r'!\[.*?\]\(.*?\)'), ''),
    # Links: keep the link text, drop the target.
    (re.compile(r'\[([^\]]+)\]\(.*?\)'), r'\1'),
    # Horizontal rules, first pass.  Must run before the emphasis rules,
    # which would otherwise reduce a "***" line to a stray "*".
    (_HORIZONTAL_RULE, ''),
    # Bold: **text** or __text__.
    (re.compile(r'(\*\*|__)(.*?)\1'), r'\2'),
    # Italic with asterisks.  The opening '*' must not be followed by
    # whitespace and the closing '*' must not be preceded by it, so list
    # bullets ("* item") and arithmetic ("2 * 3 * 4") are left alone.
    (re.compile(r'\*(?!\s)(.*?)(?<!\s)\*'), r'\1'),
    # Italic with underscores.  Same whitespace rule, and additionally the
    # delimiters must not touch a letter or digit on the outside, so
    # identifiers such as snake_case_name keep their underscores.
    # ([^\W_] matches any word character except '_'.)
    (re.compile(r'(?<![^\W_])_(?!\s)(.*?)(?<!\s)_(?![^\W_])'), r'\1'),
    # Heading markers, only at the start of a line (indentation is kept).
    # Unanchored, this rule mangled text such as "C# is" into "Cis".
    (re.compile(r'^([ \t]*)#+ ', re.MULTILINE), r'\1'),
    # Blockquotes: the whole quoted line is removed, not just the '>'.
    (re.compile(r'^>.*$', re.MULTILINE), ''),
    # List markers, bulleted or numbered, with optional indentation.
    (re.compile(r'^\s*([-*+]|\d+\.)\s+', re.MULTILINE), ''),
    # Horizontal rules, second pass: catches lines that only became a rule
    # after their list marker was removed (e.g. "- ---").
    (_HORIZONTAL_RULE, ''),
    # Tables: removes the shortest span between two '|' on a line,
    # pipes and enclosed text included.
    (re.compile(r'\|.*?\|'), ''),
    # Raw HTML tags: removes the shortest span from '<' to '>' on a line.
    (re.compile(r'<.*?>'), ''),
)


def clean_markdown(md_text: str) -> str:
    """Remove Markdown and HTML markup from *md_text*.

    Applies the substitutions in ``_MARKDOWN_RULES`` in order and finally
    decodes HTML entities (``&amp;`` -> ``&``).  Line structure is kept;
    removed constructs may leave empty lines behind.

    Args:
        md_text: Text that may contain Markdown or inline HTML.

    Returns:
        The text with the recognised markup removed.
    """
    for pattern, replacement in _MARKDOWN_RULES:
        md_text = pattern.sub(replacement, md_text)
    # Entities are decoded last so that decoded '<' / '>' characters are not
    # themselves treated as tags by the HTML rule above.
    return unescape(md_text)

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff