{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os; os.chdir('..');" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from datasets import Dataset, load_dataset" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The cells below read data/original_data.csv and use the columns wiki_intro / generated_intro,\n", "# which belong to the GPT-wiki-intro dataset, so that is the dataset that must be loaded here.\n", "dataset_name= \"aadityaubhat/GPT-wiki-intro\"\n", "# dataset_name= \"potsawee/wiki_bio_gpt3_hallucination\"  # different schema (gpt3_text, wiki_bio_text, ...); not usable by the cells below\n", "\n", "dataset= load_dataset(dataset_name)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dataset" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Export the first split (presumably the only one for GPT-wiki-intro - confirm in the output above)\n", "# under the exact file name that the next cell loads with pd.read_csv.\n", "split_name= next(iter(dataset))\n", "dataset[split_name].to_csv(\"data/original_data.csv\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idurltitlewiki_introgenerated_introtitle_lenwiki_intro_lengenerated_intro_lenpromptgenerated_textprompt_tokensgenerated_text_tokens
063064638https://en.wikipedia.org/wiki/Sexhow%20railway...Sexhow railway stationSexhow railway station was a railway station b...Sexhow railway station was a railway station l...317478200 word wikipedia style introduction on 'Sexh...located in the town of Sexhow, on the Cumbria...2588
1279621https://en.wikipedia.org/wiki/Eti%C3%A4inenEtiäinenIn Finnish folklore, all places and things, an...In Finnish folklore, all places and things, an...118780200 word wikipedia style introduction on 'Etiä...animate or inanimate, have a spirit or \"etiäi...26101
2287229https://en.wikipedia.org/wiki/Inverse%20functi...Inverse function theoremIn mathematics, specifically differential calc...In mathematics, specifically differential calc...317059200 word wikipedia style introduction on 'Inve...function theorem states that for every real-v...2665
326712375https://en.wikipedia.org/wiki/Stepping%20on%20...Stepping on Rosesis a Japanese shōjo manga series written and i...is a Japanese shōjo manga series written and i...3335121200 word wikipedia style introduction on 'Step...and illustrated by Maki Fujii. The series fol...26150
438894426https://en.wikipedia.org/wiki/Rob%20BradleyRob BradleyRobert Milner \"Rob\" Bradley, Jr. (born August ...Robert Milner \"Rob\" Bradley, Jr. (born August ...2170136200 word wikipedia style introduction on 'Rob ...29, 1973) is an American former professional ...28162
\n", "
" ], "text/plain": [ " id url \\\n", "0 63064638 https://en.wikipedia.org/wiki/Sexhow%20railway... \n", "1 279621 https://en.wikipedia.org/wiki/Eti%C3%A4inen \n", "2 287229 https://en.wikipedia.org/wiki/Inverse%20functi... \n", "3 26712375 https://en.wikipedia.org/wiki/Stepping%20on%20... \n", "4 38894426 https://en.wikipedia.org/wiki/Rob%20Bradley \n", "\n", " title \\\n", "0 Sexhow railway station \n", "1 Etiäinen \n", "2 Inverse function theorem \n", "3 Stepping on Roses \n", "4 Rob Bradley \n", "\n", " wiki_intro \\\n", "0 Sexhow railway station was a railway station b... \n", "1 In Finnish folklore, all places and things, an... \n", "2 In mathematics, specifically differential calc... \n", "3 is a Japanese shōjo manga series written and i... \n", "4 Robert Milner \"Rob\" Bradley, Jr. (born August ... \n", "\n", " generated_intro title_len \\\n", "0 Sexhow railway station was a railway station l... 3 \n", "1 In Finnish folklore, all places and things, an... 1 \n", "2 In mathematics, specifically differential calc... 3 \n", "3 is a Japanese shōjo manga series written and i... 3 \n", "4 Robert Milner \"Rob\" Bradley, Jr. (born August ... 2 \n", "\n", " wiki_intro_len generated_intro_len \\\n", "0 174 78 \n", "1 187 80 \n", "2 170 59 \n", "3 335 121 \n", "4 170 136 \n", "\n", " prompt \\\n", "0 200 word wikipedia style introduction on 'Sexh... \n", "1 200 word wikipedia style introduction on 'Etiä... \n", "2 200 word wikipedia style introduction on 'Inve... \n", "3 200 word wikipedia style introduction on 'Step... \n", "4 200 word wikipedia style introduction on 'Rob ... \n", "\n", " generated_text prompt_tokens \\\n", "0 located in the town of Sexhow, on the Cumbria... 25 \n", "1 animate or inanimate, have a spirit or \"etiäi... 26 \n", "2 function theorem states that for every real-v... 26 \n", "3 and illustrated by Maki Fujii. The series fol... 26 \n", "4 29, 1973) is an American former professional ... 
28 \n", "\n", " generated_text_tokens \n", "0 88 \n", "1 101 \n", "2 65 \n", "3 150 \n", "4 162 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df= pd.read_csv(\"data/original_data.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idurltitlewiki_introgenerated_introtitle_lenwiki_intro_lengenerated_intro_lenpromptgenerated_textprompt_tokensgenerated_text_tokens
526709147https://en.wikipedia.org/wiki/MoluccansMoluccansMoluccans are the Austronesian-speaking and Pa...Moluccans are the Austronesian-speaking and Pa...1253164200 word wikipedia style introduction on 'Molu...groups inhabiting the Maluku Islands. The ter...33238
\n", "
" ], "text/plain": [ " id url title \\\n", "5 26709147 https://en.wikipedia.org/wiki/Moluccans Moluccans \n", "\n", " wiki_intro \\\n", "5 Moluccans are the Austronesian-speaking and Pa... \n", "\n", " generated_intro title_len \\\n", "5 Moluccans are the Austronesian-speaking and Pa... 1 \n", "\n", " wiki_intro_len generated_intro_len \\\n", "5 253 164 \n", "\n", " prompt \\\n", "5 200 word wikipedia style introduction on 'Molu... \n", "\n", " generated_text prompt_tokens \\\n", "5 groups inhabiting the Maluku Islands. The ter... 33 \n", "\n", " generated_text_tokens \n", "5 238 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# df_0= df.head(1)\n", "# df_0= df.iloc[1] # series result \n", "df_0= df.iloc[[5]] # dataframe result\n", "\n", "df_0" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "title= ['Moluccans']\n", "\n", "wiki_intro= ['Moluccans are the Austronesian-speaking and Papuan-speaking ethnic groups indigenous to the Maluku Islands, also called the Moluccas and historically known as the Spice Islands, which as a region has been annexed by Indonesia since the end of 1950. As such, \"Moluccans\" is used as a blanket term for the various ethnic and linguistic groups native to the islands.\\n\\t\\nThe original inhabitants of the Maluku Islands were Austronesian and Melanesian in origin. Austronesian peoples partially assimilated the native Melanesian population in terms of linguistics and other areas, around 2000\\xa0BCE due to extensive trade, making Malayo-Polynesian creole languages the lingua franca in most of the region. Later added to this were several Dutch, Chinese, Portuguese, Spanish, Arabian and English influences due to colonization, marriage with foreign traders during the Silk-route era and Middle ages, and even with European soldiers during the World Wars. 
A small number of German descendants was added to Moluccan population, especially in Ambon, along with arrival of Protestant Missionaries since 16th century. Moluccans are predominantly Christian like many other Melanesians, but Muslim villages are also present. Despite religious differences, all groups share strong cultural bonds and a sense of common identity, such as through Adat, Pela and Bongso traditions. Music is also a binding factor, playing an important role in the cultural identity. Moluccans historically tend to be a musically gifted people, excelling in creative areas such as singing and sports. In recognition, the Moluccan capital city of Ambon was awarded the official status of City of Music by UNESCO in 2019.']\n", "\n", "generated_intro= ['Moluccans are the Austronesian-speaking and Papuan-speaking ethnic groups inhabiting the Maluku Islands. The term \"Moluccan\" is an umbrella term that covers the various Austronesian and Papuan languages spoken on the islands. The largest group of Moluccans are the Tolo-speaking people.\\n\\nThe Maluku Islands are a group of volcanic islands in eastern Indonesia, located about 1,000 kilometres east of Java and 2,000 kilometres south of New Guinea. The islands comprise over 700 islands, with a total land area of approximately 245,000 square kilometres. They have a population of around 1 million people, most of whom are Muslim. The largest island group is the Tolo-speaking region, which comprises over 60% of the population.\\n\\nThe Moluccas were first explored by Europeans in 1512. Portuguese explorer João da Nova discovered Ternate and Tidore, while Dutch explorer Jacob Roggeveen discovered Halmahera and Bougainville. The Moluccas were later visited by Spanish explorer Ferdinand Magellan in 1521. 
British explorer James Cook visited Ternate, Tidore, Halmahera, and Bougainville during his first voyage in 1770–71.']\n", "\n", "prompt= [\"200 word wikipedia style introduction on 'Moluccans'\\n Moluccans are the Austronesian-speaking and Papuan-speaking ethnic\"]\n", "\n", "generated_text= [' groups inhabiting the Maluku Islands. The term \"Moluccan\" is an umbrella term that covers the various Austronesian and Papuan languages spoken on the islands. The largest group of Moluccans are the Tolo-speaking people.\\n\\nThe Maluku Islands are a group of volcanic islands in eastern Indonesia, located about 1,000 kilometres east of Java and 2,000 kilometres south of New Guinea. The islands comprise over 700 islands, with a total land area of approximately 245,000 square kilometres. They have a population of around 1 million people, most of whom are Muslim. The largest island group is the Tolo-speaking region, which comprises over 60% of the population.\\n\\nThe Moluccas were first explored by Europeans in 1512. Portuguese explorer João da Nova discovered Ternate and Tidore, while Dutch explorer Jacob Roggeveen discovered Halmahera and Bougainville. The Moluccas were later visited by Spanish explorer Ferdinand Magellan in 1521. British explorer James Cook visited Ternate, Tidore, Halmahera, and Bougainville during his first voyage in 1770–71.']\n" ] } ], "source": [ "print(f\"title= {df_0.title.values}\")\n", "print()\n", "print(f\"wiki_intro= {df_0.wiki_intro.values}\")\n", "print()\n", "print(f\"generated_intro= {df_0.generated_intro.values}\") # prompt {7 tokens} + generated_text\n", "print()\n", "print(f\"prompt= {df_0.prompt.values}\")\n", "print()\n", "print(f\"generated_text= {df_0.generated_text.values}\")\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
0Sexhow railway station was a railway station b...0
1In Finnish folklore, all places and things, an...0
2In mathematics, specifically differential calc...0
3is a Japanese shōjo manga series written and i...0
4Robert Milner \"Rob\" Bradley, Jr. (born August ...0
.........
149995Randy Borum is a Professor and Coordinator of ...0
149996Sa'och (, also, \"Sauch\") is an endangered, nea...0
149997Philip C. Hanawalt (born 1931) is an American ...0
149998Vossius Gymnasium is a public gymnasium in Ams...0
149999Simone Stratigo (, Symeon Filippos Stratigos; ...0
\n", "

150000 rows × 2 columns

\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "0 Sexhow railway station was a railway station b... 0\n", "1 In Finnish folklore, all places and things, an... 0\n", "2 In mathematics, specifically differential calc... 0\n", "3 is a Japanese shōjo manga series written and i... 0\n", "4 Robert Milner \"Rob\" Bradley, Jr. (born August ... 0\n", "... ... ...\n", "149995 Randy Borum is a Professor and Coordinator of ... 0\n", "149996 Sa'och (, also, \"Sauch\") is an endangered, nea... 0\n", "149997 Philip C. Hanawalt (born 1931) is an American ... 0\n", "149998 Vossius Gymnasium is a public gymnasium in Ams... 0\n", "149999 Simone Stratigo (, Symeon Filippos Stratigos; ... 0\n", "\n", "[150000 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Paragraphs from the wiki_intro column, labelled AI_generated=0.\n", "# The constant label is broadcast on df's own index, so it can never misalign with the text.\n", "new_df_human= (\n", "    df[[\"wiki_intro\"]]\n", "    .rename(columns={\"wiki_intro\": \"Paragraph\"})\n", "    .assign(AI_generated=0)\n", ")\n", "new_df_human" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
0Sexhow railway station was a railway station l...1
1In Finnish folklore, all places and things, an...1
2In mathematics, specifically differential calc...1
3is a Japanese shōjo manga series written and i...1
4Robert Milner \"Rob\" Bradley, Jr. (born August ...1
.........
149995Randy Borum is a Professor and Coordinator of ...1
149996Sa'och (, also, \"Sauch\") is an endangered, nuc...1
149997Philip C. Hanawalt (born 1931) is an American ...1
149998Vossius Gymnasium is a public gymnasium in the...1
149999Simone Stratigo (, Symeon Filippos Stratigos; ...1
\n", "

150000 rows × 2 columns

\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "0 Sexhow railway station was a railway station l... 1\n", "1 In Finnish folklore, all places and things, an... 1\n", "2 In mathematics, specifically differential calc... 1\n", "3 is a Japanese shōjo manga series written and i... 1\n", "4 Robert Milner \"Rob\" Bradley, Jr. (born August ... 1\n", "... ... ...\n", "149995 Randy Borum is a Professor and Coordinator of ... 1\n", "149996 Sa'och (, also, \"Sauch\") is an endangered, nuc... 1\n", "149997 Philip C. Hanawalt (born 1931) is an American ... 1\n", "149998 Vossius Gymnasium is a public gymnasium in the... 1\n", "149999 Simone Stratigo (, Symeon Filippos Stratigos; ... 1\n", "\n", "[150000 rows x 2 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Paragraphs from the generated_intro column, labelled AI_generated=1.\n", "# The constant label is broadcast on df's own index, so it can never misalign with the text.\n", "new_df_ai= (\n", "    df[[\"generated_intro\"]]\n", "    .rename(columns={\"generated_intro\": \"Paragraph\"})\n", "    .assign(AI_generated=1)\n", ")\n", "new_df_ai" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([\"In Finnish folklore, all places and things, and also human beings, have a haltija (a genius, guardian spirit) of their own. One such haltija is called etiäinen—an image, doppelgänger, or just an impression that goes ahead of a person, doing things the person in question later does. For example, people waiting at home might hear the door close or even see a shadow or a silhouette, only to realize that no one has yet arrived. Etiäinen can also refer to some kind of a feeling that something is going to happen. Sometimes it could, for example, warn of a bad year coming. In modern Finnish, the term has detached from its shamanistic origins and refers to premonition. Unlike clairvoyance, divination, and similar practices, etiäiset (plural) are spontaneous and can't be induced. Quite the opposite, they may be unwanted and cause anxiety, like ghosts. 
Etiäiset need not be too dramatic and may concern everyday events, although ones related to e.g. deaths are common. As these phenomena are still reported today, they can be considered a living tradition, as a way to explain the psychological experience of premonition.\"],\n", " dtype=object)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df_human.iloc[[1]].Paragraph.values" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['In Finnish folklore, all places and things, animate or inanimate, have a spirit or \"etiäinen\" that lives there. Etiäinen can manifest in many forms, but is usually described as a kind, elderly woman with white hair. She is the guardian of natural places and often helps people in need. \\n\\nEtiäinen has been a part of Finnish culture for centuries and is still widely believed in today. Folklorists study etiäinen to understand Finnish traditions and how they have changed over time.'],\n", " dtype=object)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df_ai.iloc[[1]].Paragraph.values\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
0Sexhow railway station was a railway station l...1
1In Finnish folklore, all places and things, an...1
2In mathematics, specifically differential calc...1
3is a Japanese shōjo manga series written and i...1
4Robert Milner \"Rob\" Bradley, Jr. (born August ...1
.........
149995Randy Borum is a Professor and Coordinator of ...0
149996Sa'och (, also, \"Sauch\") is an endangered, nea...0
149997Philip C. Hanawalt (born 1931) is an American ...0
149998Vossius Gymnasium is a public gymnasium in Ams...0
149999Simone Stratigo (, Symeon Filippos Stratigos; ...0
\n", "

300000 rows × 2 columns

\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "0 Sexhow railway station was a railway station l... 1\n", "1 In Finnish folklore, all places and things, an... 1\n", "2 In mathematics, specifically differential calc... 1\n", "3 is a Japanese shōjo manga series written and i... 1\n", "4 Robert Milner \"Rob\" Bradley, Jr. (born August ... 1\n", "... ... ...\n", "149995 Randy Borum is a Professor and Coordinator of ... 0\n", "149996 Sa'och (, also, \"Sauch\") is an endangered, nea... 0\n", "149997 Philip C. Hanawalt (born 1931) is an American ... 0\n", "149998 Vossius Gymnasium is a public gymnasium in Ams... 0\n", "149999 Simone Stratigo (, Symeon Filippos Stratigos; ... 0\n", "\n", "[300000 rows x 2 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df= pd.concat([new_df_ai, new_df_human])\n", "concat_df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
0Sexhow railway station was a railway station l...1
\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "0 Sexhow railway station was a railway station l... 1" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First row of the concatenated frame (position 0), kept as a one-row DataFrame.\n", "concat_df.iloc[[0]]\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
0Sexhow railway station was a railway station b...0
\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "0 Sexhow railway station was a railway station b... 0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df.iloc[[150000]]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# Shuffle all rows and renumber them 0..n-1.\n", "# A fixed random_state makes the row order, and therefore the CSV exported later, reproducible across runs.\n", "cdf_shffeled= concat_df.sample(frac=1, random_state=42).reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
0Sexhow railway station was a railway station l...1
1In Finnish folklore, all places and things, an...1
2In mathematics, specifically differential calc...1
3is a Japanese shōjo manga series written and i...1
4Robert Milner \"Rob\" Bradley, Jr. (born August ...1
.........
149995Randy Borum is a Professor and Coordinator of ...0
149996Sa'och (, also, \"Sauch\") is an endangered, nea...0
149997Philip C. Hanawalt (born 1931) is an American ...0
149998Vossius Gymnasium is a public gymnasium in Ams...0
149999Simone Stratigo (, Symeon Filippos Stratigos; ...0
\n", "

300000 rows × 2 columns

\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "0 Sexhow railway station was a railway station l... 1\n", "1 In Finnish folklore, all places and things, an... 1\n", "2 In mathematics, specifically differential calc... 1\n", "3 is a Japanese shōjo manga series written and i... 1\n", "4 Robert Milner \"Rob\" Bradley, Jr. (born August ... 1\n", "... ... ...\n", "149995 Randy Borum is a Professor and Coordinator of ... 0\n", "149996 Sa'och (, also, \"Sauch\") is an endangered, nea... 0\n", "149997 Philip C. Hanawalt (born 1931) is an American ... 0\n", "149998 Vossius Gymnasium is a public gymnasium in Ams... 0\n", "149999 Simone Stratigo (, Symeon Filippos Stratigos; ... 0\n", "\n", "[300000 rows x 2 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "concat_df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ParagraphAI_generated
0Bentivoglio is an Italian surname. Notable peo...0
1Stephanie McCallum (born Sydney, Australia, 3...0
2Abdāl lit: substitutes, but which can also be...1
3The Gadget is a young adult historical novel w...1
4The A-1 lifeboat was a powered lifeboat that w...0
.........
299995James Wrighten (b. 1745 - d. 1793) was an Amer...1
299996The U-matrix (unified distance matrix) is a sy...1
299997Holding Trevor is a 2007 American gay-themed p...1
299998Jarvisfield is a heritage-listed former pastor...0
299999The Silver Guardian is a Chinese web series c...1
\n", "

300000 rows × 2 columns

\n", "
" ], "text/plain": [ " Paragraph AI_generated\n", "0 Bentivoglio is an Italian surname. Notable peo... 0\n", "1 Stephanie McCallum (born Sydney, Australia, 3... 0\n", "2 Abdāl lit: substitutes, but which can also be... 1\n", "3 The Gadget is a young adult historical novel w... 1\n", "4 The A-1 lifeboat was a powered lifeboat that w... 0\n", "... ... ...\n", "299995 James Wrighten (b. 1745 - d. 1793) was an Amer... 1\n", "299996 The U-matrix (unified distance matrix) is a sy... 1\n", "299997 Holding Trevor is a 2007 American gay-themed p... 1\n", "299998 Jarvisfield is a heritage-listed former pastor... 0\n", "299999 The Silver Guardian is a Chinese web series c... 1\n", "\n", "[300000 rows x 2 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cdf_shffeled" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "cdf_shffeled.to_csv(\"data/AI_checker_remade.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }