{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os; os.chdir('..')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-10-06 08:29:05.356100: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", "2023-10-06 08:29:07.144648: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], "source": [ "import tensorflow as tf\n", "import tensorflow_hub as hub\n", "import tensorflow_text as text\n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-10-06 08:29:56.220975: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.594000: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.597741: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.602848: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.606510: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.610005: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.802249: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.804413: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.806460: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", "2023-10-06 08:29:56.808332: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20719 MB memory: -> device: 0, name: NVIDIA A10G, pci bus id: 0000:00:1e.0, compute capability: 8.6\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.device(\"gpu\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0ParagraphAI_generated
00'O' Level were a British punk and hardcore pun...1
11Daisy Maud Bellis (February 16, 1887 – 1971) w...0
22The white-browed wagtail or large pied wagtail...1
33Judi Beecher is an American actress, voice art...1
44Natural Information Society is a music ensembl...1
\n", "
" ], "text/plain": [ " Unnamed: 0 Paragraph AI_generated\n", "0 0 'O' Level were a British punk and hardcore pun... 1\n", "1 1 Daisy Maud Bellis (February 16, 1887 – 1971) w... 0\n", "2 2 The white-browed wagtail or large pied wagtail... 1\n", "3 3 Judi Beecher is an American actress, voice art... 1\n", "4 4 Natural Information Society is a music ensembl... 1" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df= pd.read_csv(\"data/AI_checker_remade.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0
countmeanstdmin25%50%75%max
AI_generated
0150000.0150063.3201886567.6919961.075285.75150184.5225114.75299997.0
1150000.0149935.6798286637.9048500.074721.00149817.5224878.25299999.0
\n", "
" ], "text/plain": [ " Unnamed: 0 \\\n", " count mean std min 25% 50% \n", "AI_generated \n", "0 150000.0 150063.32018 86567.691996 1.0 75285.75 150184.5 \n", "1 150000.0 149935.67982 86637.904850 0.0 74721.00 149817.5 \n", "\n", " \n", " 75% max \n", "AI_generated \n", "0 225114.75 299997.0 \n", "1 224878.25 299999.0 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.groupby('AI_generated').describe()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AI_generated\n", "1 150000\n", "0 150000\n", "Name: count, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.AI_generated.value_counts()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from keys import get_base_embeddings, get_base_embeddings_bulk\n", "\n", "\n", "\n", "import json\n", "\n", "def get_embeddings(sentence):\n", " response= requests.post(get_base_embeddings_bulk, json={\n", " \"text\": sentence\n", "})\n", " return json.loads(response.content.decode('utf-8'))['embeddings']" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# embeddings= get_embeddings(df.Paragraph.values.tolist())" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# df['embeddings']= df.Paragraph.apply(get_embeddings)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"data/new_data_with_embeddings.csv\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(df.Paragraph, df.AI_generated, stratify=df.AI_generated)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "207977 A line lock is a device that prevents unauthor...\n", "232258 Volvariella bombycina, commonly known as the s...\n", "213059 Vithusha also spelt as Vidusha commonly known ...\n", "110203 The AECOM Building, formerly known as the Pent...\n", "267317 John Warren Aldrich (February 23, 1906 – May 2...\n", "Name: Paragraph, dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head(5)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "207977 1\n", "232258 0\n", "213059 1\n", "110203 0\n", "267317 1\n", "Name: AI_generated, dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train.head(5)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BERT model selected : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1\n", "Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3\n" ] } ], "source": [ "bert_model_name = 'small_bert/bert_en_uncased_L-4_H-128_A-2' \n", "\n", "map_name_to_handle = {\n", " 'bert_en_uncased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',\n", " 'bert_en_cased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',\n", " 'bert_multi_cased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',\n", " 'small_bert/bert_en_uncased_L-2_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',\n", " 'small_bert/bert_en_uncased_L-2_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',\n", " 'small_bert/bert_en_uncased_L-2_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',\n", " 'small_bert/bert_en_uncased_L-2_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',\n", " 'small_bert/bert_en_uncased_L-4_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',\n", " 'small_bert/bert_en_uncased_L-4_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',\n", " 'small_bert/bert_en_uncased_L-4_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',\n", " 'small_bert/bert_en_uncased_L-4_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',\n", " 'small_bert/bert_en_uncased_L-6_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',\n", " 'small_bert/bert_en_uncased_L-6_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',\n", " 'small_bert/bert_en_uncased_L-6_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',\n", " 'small_bert/bert_en_uncased_L-6_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',\n", " 'small_bert/bert_en_uncased_L-8_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',\n", " 'small_bert/bert_en_uncased_L-8_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',\n", " 'small_bert/bert_en_uncased_L-8_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',\n", " 'small_bert/bert_en_uncased_L-8_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',\n", " 'small_bert/bert_en_uncased_L-10_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',\n", " 'small_bert/bert_en_uncased_L-10_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',\n", " 'small_bert/bert_en_uncased_L-10_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',\n", " 'small_bert/bert_en_uncased_L-10_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',\n", " 'small_bert/bert_en_uncased_L-12_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',\n", " 'small_bert/bert_en_uncased_L-12_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',\n", " 'small_bert/bert_en_uncased_L-12_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',\n", " 'small_bert/bert_en_uncased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',\n", " 'albert_en_base':\n", " 'https://tfhub.dev/tensorflow/albert_en_base/2',\n", " 'electra_small':\n", " 'https://tfhub.dev/google/electra_small/2',\n", " 'electra_base':\n", " 'https://tfhub.dev/google/electra_base/2',\n", " 'experts_pubmed':\n", " 'https://tfhub.dev/google/experts/bert/pubmed/2',\n", " 'experts_wiki_books':\n", " 'https://tfhub.dev/google/experts/bert/wiki_books/2',\n", " 'talking-heads_base':\n", " 'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',\n", "}\n", "\n", "map_model_to_preprocess = {\n", " 'bert_en_uncased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'bert_en_cased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-2_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-2_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-2_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-2_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-4_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-4_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-4_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-4_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-6_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-6_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-6_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-6_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-8_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-8_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-8_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-8_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-10_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-10_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-10_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-10_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-12_H-128_A-2':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-12_H-256_A-4':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-12_H-512_A-8':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'small_bert/bert_en_uncased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'bert_multi_cased_L-12_H-768_A-12':\n", " 'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',\n", " 'albert_en_base':\n", " 'https://tfhub.dev/tensorflow/albert_en_preprocess/3',\n", " 'electra_small':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'electra_base':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'experts_pubmed':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'experts_wiki_books':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", " 'talking-heads_base':\n", " 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',\n", "}\n", "\n", "tfhub_handle_encoder = map_name_to_handle[bert_model_name]\n", "tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]\n", "\n", "print(f'BERT model selected : {tfhub_handle_encoder}')\n", "print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "bert_preprocessor= hub.KerasLayer(tfhub_handle_preprocess)\n", "bert_encoder= hub.KerasLayer(tfhub_handle_encoder)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "def get_sentence_embeddings(sentences):\n", " preprocessed_text= bert_preprocessor(sentences)\n", " return bert_encoder(preprocessed_text)['pooled_output']\n", " " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "TensorShape([2, 128])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_sentence_embeddings(\n", " [\n", " \"baby\", \"dont leave me\"\n", " ]\n", ").shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }