Ubuntu committed on
Commit
ad4c3bb
1 Parent(s): 2fb2dc0

added wikigpt

Browse files
.gitignore CHANGED
@@ -1,2 +1,2 @@
1
  keys.py
2
- wiki_gpt/
 
1
  keys.py
2
+ # wiki_gpt/
gpt3_finetuned_model/runs/Oct09_08-16-50_ip-172-31-95-165/events.out.tfevents.1696839410.ip-172-31-95-165.74908.0 CHANGED
Binary files a/gpt3_finetuned_model/runs/Oct09_08-16-50_ip-172-31-95-165/events.out.tfevents.1696839410.ip-172-31-95-165.74908.0 and b/gpt3_finetuned_model/runs/Oct09_08-16-50_ip-172-31-95-165/events.out.tfevents.1696839410.ip-172-31-95-165.74908.0 differ
 
wiki_gpt/.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ venv
2
+ wiki_intro.csv
3
+ wiki_intro_processed.csv
4
+ config.py
5
+ data/*
6
+ __pycache__/*
7
+ _archive/*
8
+ # GPT-wiki-intro.csv
9
+ GPT-wiki-intro.csv.zip
wiki_gpt/README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Wiki GPT
2
+
3
+ Code to generate [GPT-wiki-intro dataset](https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro#overview).
4
+ This dataset can be used to train and evaluate models to classify text as human-written or GPT-generated.
5
+
6
+ To run the code:
7
+ 1. Install requirements from requirements.txt
8
+ 2. Create `config.py` in this folder and define `OPENAI_API_KEY` in it (the notebooks import it with `from config import OPENAI_API_KEY`)
9
+ 3. Create a `data` folder
10
+ 4. Run `process_wiki_data.ipynb`
11
+ 5. Run `download_openai.ipynb`
12
+ 6. Run `process_results.ipynb`
13
+
14
+ ## Citation
15
+
16
+ Please cite as follows:
17
+
18
+ ```
19
+ @misc {aaditya_bhat_2023,
20
+ author = { {Aaditya Bhat} },
21
+ title = { GPT-wiki-intro (Revision 0e458f5) },
22
+ year = 2023,
23
+ url = { https://huggingface.co/datasets/aadityaubhat/GPT-wiki-intro },
24
+ doi = { 10.57967/hf/0326 },
25
+ publisher = { Hugging Face }
26
+ }
27
+ ```
wiki_gpt/download_openai.ipynb ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 25,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pickle\n",
10
+ "from tqdm import tqdm\n",
11
+ "import pandas as pd\n",
12
+ "import re\n",
13
+ "import openai\n",
14
+ "from config import OPENAI_API_KEY\n",
15
+ "import dask"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 55,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "openai.api_key = OPENAI_API_KEY"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 8,
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "data_df = pd.read_csv('wiki_intro_processed.csv')"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 44,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "def chunkify(lst, n):\n",
43
+ " return [lst[i:i + n] for i in range(0, len(lst), n)]"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 5,
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "def create_prompt(title, starter_text):\n",
53
+ " return f'''200 word wikipedia style introduction on '{title}'\n",
54
+ " {starter_text}'''"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 33,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "def get_openai_response(dct): \n",
64
+ " title = dct['title']\n",
65
+ " starter_text = dct['starter_text']\n",
66
+ "\n",
67
+ " prompt = create_prompt(title, starter_text)\n",
68
+ "\n",
69
+ " return {title: openai.Completion.create(\n",
70
+ " model=\"text-curie-001\",\n",
71
+ " prompt=prompt,\n",
72
+ " temperature=0.7,\n",
73
+ " max_tokens=300,\n",
74
+ " top_p=1,\n",
75
+ " frequency_penalty=0.4,\n",
76
+ " presence_penalty=0.1\n",
77
+ " )}"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 35,
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "def fake_get_openai_response(dct):\n",
87
+ " title = dct['title']\n",
88
+ " starter_text = dct['starter_text']\n",
89
+ "\n",
90
+ " prompt = create_prompt(title, starter_text)\n",
91
+ "\n",
92
+ " return {title: f'{prompt} blah blah blah'}"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 54,
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "def run_dask(lst_dct):\n",
102
+ " delayed_calls = [dask.delayed(get_openai_response)(dct) for dct in lst_dct]\n",
103
+ " results = dask.compute(*delayed_calls)\n",
104
+ " return results"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 70,
110
+ "metadata": {},
111
+ "outputs": [
112
+ {
113
+ "name": "stdout",
114
+ "output_type": "stream",
115
+ "text": [
116
+ "1\n",
117
+ "2\n",
118
+ "3\n",
119
+ "4\n",
120
+ "5\n",
121
+ "6\n",
122
+ "7\n",
123
+ "8\n",
124
+ "9\n",
125
+ "10\n",
126
+ "11\n",
127
+ "12\n",
128
+ "13\n",
129
+ "14\n",
130
+ "15\n",
131
+ "16\n",
132
+ "17\n",
133
+ "18\n",
134
+ "19\n",
135
+ "20\n",
136
+ "21\n",
137
+ "22\n",
138
+ "23\n",
139
+ "24\n",
140
+ "25\n",
141
+ "26\n",
142
+ "27\n",
143
+ "28\n",
144
+ "29\n",
145
+ "30\n",
146
+ "31\n",
147
+ "32\n",
148
+ "33\n",
149
+ "34\n",
150
+ "35\n",
151
+ "36\n",
152
+ "37\n",
153
+ "38\n",
154
+ "39\n",
155
+ "40\n",
156
+ "41\n",
157
+ "42\n",
158
+ "43\n",
159
+ "44\n",
160
+ "45\n",
161
+ "46\n",
162
+ "47\n",
163
+ "48\n",
164
+ "49\n",
165
+ "50\n",
166
+ "51\n",
167
+ "52\n",
168
+ "53\n",
169
+ "54\n",
170
+ "55\n",
171
+ "56\n",
172
+ "57\n",
173
+ "58\n",
174
+ "59\n",
175
+ "60\n",
176
+ "61\n",
177
+ "62\n",
178
+ "63\n",
179
+ "64\n",
180
+ "65\n",
181
+ "66\n",
182
+ "67\n",
183
+ "68\n",
184
+ "69\n",
185
+ "70\n",
186
+ "71\n",
187
+ "72\n",
188
+ "73\n",
189
+ "74\n",
190
+ "75\n",
191
+ "76\n",
192
+ "77\n",
193
+ "78\n",
194
+ "79\n",
195
+ "80\n",
196
+ "81\n",
197
+ "82\n",
198
+ "83\n",
199
+ "84\n",
200
+ "85\n",
201
+ "86\n",
202
+ "87\n",
203
+ "88\n",
204
+ "89\n",
205
+ "90\n",
206
+ "91\n",
207
+ "92\n",
208
+ "93\n",
209
+ "94\n",
210
+ "95\n",
211
+ "96\n",
212
+ "97\n",
213
+ "98\n",
214
+ "99\n",
215
+ "100\n",
216
+ "101\n",
217
+ "102\n",
218
+ "103\n",
219
+ "104\n",
220
+ "105\n",
221
+ "106\n",
222
+ "107\n",
223
+ "108\n",
224
+ "109\n",
225
+ "110\n",
226
+ "111\n",
227
+ "112\n",
228
+ "113\n",
229
+ "114\n",
230
+ "115\n",
231
+ "116\n",
232
+ "117\n",
233
+ "118\n",
234
+ "119\n",
235
+ "120\n",
236
+ "121\n",
237
+ "122\n",
238
+ "123\n",
239
+ "124\n",
240
+ "125\n",
241
+ "126\n",
242
+ "127\n",
243
+ "128\n",
244
+ "129\n",
245
+ "130\n",
246
+ "131\n",
247
+ "132\n",
248
+ "133\n",
249
+ "134\n",
250
+ "135\n",
251
+ "136\n",
252
+ "137\n",
253
+ "138\n",
254
+ "139\n",
255
+ "140\n",
256
+ "141\n",
257
+ "142\n",
258
+ "143\n",
259
+ "144\n",
260
+ "145\n",
261
+ "146\n",
262
+ "147\n",
263
+ "148\n",
264
+ "149\n"
265
+ ]
266
+ }
267
+ ],
268
+ "source": [
269
+ "chunk_n = 0\n",
270
+ "for chunk in chunkify(data_df[['title', 'starter_text']].to_dict('records'), 1000):\n",
271
+ " if chunk_n == 0:\n",
272
+ " chunk_n += 1\n",
273
+ " continue\n",
274
+ " print(chunk_n)\n",
275
+ " result = run_dask(chunk)\n",
276
+ " with open(f'data/result-{chunk_n}.pkl', 'wb') as file:\n",
277
+ " pickle.dump(result, file)\n",
278
+ " chunk_n += 1\n"
279
+ ]
280
+ }
281
+ ],
282
+ "metadata": {
283
+ "kernelspec": {
284
+ "display_name": "venv",
285
+ "language": "python",
286
+ "name": "python3"
287
+ },
288
+ "language_info": {
289
+ "codemirror_mode": {
290
+ "name": "ipython",
291
+ "version": 3
292
+ },
293
+ "file_extension": ".py",
294
+ "mimetype": "text/x-python",
295
+ "name": "python",
296
+ "nbconvert_exporter": "python",
297
+ "pygments_lexer": "ipython3",
298
+ "version": "3.10.6 (main, Aug 30 2022, 05:12:36) [Clang 13.1.6 (clang-1316.0.21.2.5)]"
299
+ },
300
+ "orig_nbformat": 4,
301
+ "vscode": {
302
+ "interpreter": {
303
+ "hash": "3f100d68d9cf80676b1a4c3ace5430b03ae266a1d88e3f101eb196b64b263632"
304
+ }
305
+ }
306
+ },
307
+ "nbformat": 4,
308
+ "nbformat_minor": 2
309
+ }
wiki_gpt/process_results.ipynb ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pandas as pd\n",
10
+ "import pickle\n",
11
+ "import os; os.chdir('..')"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "data_df = pd.read_csv('wiki_intro_processed.csv')"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 18,
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "def create_prompt(title, starter_text):\n",
30
+ " return f'''200 word wikipedia style introduction on '{title}'\n",
31
+ " {starter_text}'''"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": 4,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "results = []\n",
41
+ "\n",
42
+ "for i in range(150):\n",
43
+ " with open(f'data/result-{i}.pkl', 'rb') as file:\n",
44
+ " temp = pickle.load(file)\n",
45
+ " results += list(temp)"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 5,
51
+ "metadata": {},
52
+ "outputs": [
53
+ {
54
+ "data": {
55
+ "text/plain": [
56
+ "150000"
57
+ ]
58
+ },
59
+ "execution_count": 5,
60
+ "metadata": {},
61
+ "output_type": "execute_result"
62
+ }
63
+ ],
64
+ "source": [
65
+ "len(results)"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 9,
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "processed_results = []\n",
75
+ "for dct in results:\n",
76
+ " for key in dct:\n",
77
+ " processed_results.append({\n",
78
+ " 'title': key, \n",
79
+ " 'generated_text': dct[key]['choices'][0]['text'],\n",
80
+ " 'prompt_tokens' : dct[key]['usage']['prompt_tokens'],\n",
81
+ " 'completion_tokens' : dct[key]['usage']['completion_tokens'],\n",
82
+ " })"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 12,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "processed_results_df = pd.DataFrame(processed_results)"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 16,
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "# Create final df\n",
101
+ "final_df = pd.merge(data_df, processed_results_df, on=['title'])"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 19,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "# Create new columns\n",
111
+ "final_df['prompt'] = final_df.apply(lambda row: create_prompt(row['title'], row['starter_text']), axis=1)\n",
112
+ "\n",
113
+ "final_df['generated_text_complete'] = final_df['starter_text'] + final_df['generated_text']\n",
114
+ "\n",
115
+ "final_df['generated_text_len'] = final_df['generated_text_complete'].apply(lambda x: len(x.split(' ')))"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": 38,
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "# Rename columns\n",
125
+ "final_df = final_df.rename(columns={\n",
126
+ " 'intro': 'wiki_intro', 'intro_len': 'wiki_intro_len',\n",
127
+ " 'generated_text_complete': 'generated_intro', 'generated_text_len' : 'generated_intro_len'})"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": 40,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "# Reorder columns\n",
137
+ "final_df = final_df[[ \n",
138
+ " 'id', 'url', 'title', 'wiki_intro', 'generated_intro', 'title_len',\n",
139
+ " 'wiki_intro_len', 'generated_intro_len', 'prompt', 'generated_text',\n",
140
+ " 'prompt_tokens', 'generated_text_tokens']]"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": 42,
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "# Write csv file\n",
150
+ "final_df.to_csv('GPT-wiki-intro.csv', index=False)"
151
+ ]
152
+ }
153
+ ],
154
+ "metadata": {
155
+ "kernelspec": {
156
+ "display_name": "venv",
157
+ "language": "python",
158
+ "name": "python3"
159
+ },
160
+ "language_info": {
161
+ "codemirror_mode": {
162
+ "name": "ipython",
163
+ "version": 3
164
+ },
165
+ "file_extension": ".py",
166
+ "mimetype": "text/x-python",
167
+ "name": "python",
168
+ "nbconvert_exporter": "python",
169
+ "pygments_lexer": "ipython3",
170
+ "version": "3.10.12"
171
+ },
172
+ "orig_nbformat": 4,
173
+ "vscode": {
174
+ "interpreter": {
175
+ "hash": "3f100d68d9cf80676b1a4c3ace5430b03ae266a1d88e3f101eb196b64b263632"
176
+ }
177
+ }
178
+ },
179
+ "nbformat": 4,
180
+ "nbformat_minor": 2
181
+ }
wiki_gpt/process_wiki_data.ipynb ADDED
@@ -0,0 +1,520 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os; os.chdir('..')"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 5,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# pip install apache-beam==2.43.0\n",
19
+ "# pip install --no-deps multiprocess==0.70.14"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 6,
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "ename": "ImportError",
29
+ "evalue": "cannot import name 'get_context' from 'multiprocess' (/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/multiprocess/__init__.py)",
30
+ "output_type": "error",
31
+ "traceback": [
32
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
33
+ "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
34
+ "\u001b[1;32m/home/ubuntu/SentenceStructureComparision/wiki_gpt/process_wiki_data.ipynb Cell 3\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/wiki_gpt/process_wiki_data.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mdatasets\u001b[39;00m \u001b[39mimport\u001b[39;00m load_dataset\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/wiki_gpt/process_wiki_data.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=1'>2</a>\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtqdm\u001b[39;00m \u001b[39mimport\u001b[39;00m tqdm\n\u001b[1;32m <a href='vscode-notebook-cell://ssh-remote%2B7b22686f73744e616d65223a22456d62656464696e6773227d/home/ubuntu/SentenceStructureComparision/wiki_gpt/process_wiki_data.ipynb#W1sdnNjb2RlLXJlbW90ZQ%3D%3D?line=2'>3</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n",
35
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/__init__.py:22\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# flake8: noqa\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[39m# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[39m# pylint: enable=line-too-long\u001b[39;00m\n\u001b[1;32m 18\u001b[0m \u001b[39m# pylint: disable=g-import-not-at-top,g-bad-import-order,wrong-import-position\u001b[39;00m\n\u001b[1;32m 20\u001b[0m __version__ \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m2.14.5\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m---> 22\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39marrow_dataset\u001b[39;00m \u001b[39mimport\u001b[39;00m Dataset\n\u001b[1;32m 23\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39marrow_reader\u001b[39;00m \u001b[39mimport\u001b[39;00m ReadInstruction\n\u001b[1;32m 24\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mbuilder\u001b[39;00m \u001b[39mimport\u001b[39;00m ArrowBasedBuilder, BeamBasedBuilder, BuilderConfig, DatasetBuilder, GeneratorBasedBuilder\n",
36
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/arrow_dataset.py:67\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m \u001b[39mimport\u001b[39;00m config\n\u001b[1;32m 66\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39marrow_reader\u001b[39;00m \u001b[39mimport\u001b[39;00m ArrowReader\n\u001b[0;32m---> 67\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39marrow_writer\u001b[39;00m \u001b[39mimport\u001b[39;00m ArrowWriter, OptimizedTypedSequence\n\u001b[1;32m 68\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mdata_files\u001b[39;00m \u001b[39mimport\u001b[39;00m sanitize_patterns\n\u001b[1;32m 69\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mdownload\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdownload_config\u001b[39;00m \u001b[39mimport\u001b[39;00m DownloadConfig\n",
37
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/arrow_writer.py:27\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpyarrow\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mparquet\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpq\u001b[39;00m\n\u001b[1;32m 26\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m \u001b[39mimport\u001b[39;00m config\n\u001b[0;32m---> 27\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mfeatures\u001b[39;00m \u001b[39mimport\u001b[39;00m Features, Image, Value\n\u001b[1;32m 28\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mfeatures\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mfeatures\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m 29\u001b[0m FeatureType,\n\u001b[1;32m 30\u001b[0m _ArrayXDExtensionType,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 36\u001b[0m to_pyarrow_listarray,\n\u001b[1;32m 37\u001b[0m )\n\u001b[1;32m 38\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mfilesystems\u001b[39;00m \u001b[39mimport\u001b[39;00m is_remote_filesystem\n",
38
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/features/__init__.py:17\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# flake8: noqa\u001b[39;00m\n\u001b[1;32m 3\u001b[0m __all__ \u001b[39m=\u001b[39m [\n\u001b[1;32m 4\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mAudio\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mArray2D\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mTranslationVariableLanguages\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 16\u001b[0m ]\n\u001b[0;32m---> 17\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39maudio\u001b[39;00m \u001b[39mimport\u001b[39;00m Audio\n\u001b[1;32m 18\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mfeatures\u001b[39;00m \u001b[39mimport\u001b[39;00m Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, Sequence, Value\n\u001b[1;32m 19\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mimage\u001b[39;00m \u001b[39mimport\u001b[39;00m Image\n",
39
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/features/audio.py:11\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m \u001b[39mimport\u001b[39;00m config\n\u001b[1;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdownload\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdownload_config\u001b[39;00m \u001b[39mimport\u001b[39;00m DownloadConfig\n\u001b[0;32m---> 11\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdownload\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mstreaming_download_manager\u001b[39;00m \u001b[39mimport\u001b[39;00m xopen, xsplitext\n\u001b[1;32m 12\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtable\u001b[39;00m \u001b[39mimport\u001b[39;00m array_cast\n\u001b[1;32m 13\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpy_utils\u001b[39;00m \u001b[39mimport\u001b[39;00m no_op_if_value_is_null, string_to_dict\n",
40
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/download/__init__.py:9\u001b[0m\n\u001b[1;32m 1\u001b[0m __all__ \u001b[39m=\u001b[39m [\n\u001b[1;32m 2\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mDownloadConfig\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mDownloadManager\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 4\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mDownloadMode\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mStreamingDownloadManager\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m ]\n\u001b[1;32m 8\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mdownload_config\u001b[39;00m \u001b[39mimport\u001b[39;00m DownloadConfig\n\u001b[0;32m----> 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mdownload_manager\u001b[39;00m \u001b[39mimport\u001b[39;00m DownloadManager, DownloadMode\n\u001b[1;32m 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mstreaming_download_manager\u001b[39;00m \u001b[39mimport\u001b[39;00m StreamingDownloadManager\n",
41
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/download/download_manager.py:35\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39minfo_utils\u001b[39;00m \u001b[39mimport\u001b[39;00m get_size_checksum_dict\n\u001b[1;32m 34\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mlogging\u001b[39;00m \u001b[39mimport\u001b[39;00m get_logger, is_progress_bar_enabled, tqdm\n\u001b[0;32m---> 35\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mutils\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpy_utils\u001b[39;00m \u001b[39mimport\u001b[39;00m NestedDataStructure, map_nested, size_str\n\u001b[1;32m 36\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mdownload_config\u001b[39;00m \u001b[39mimport\u001b[39;00m DownloadConfig\n\u001b[1;32m 39\u001b[0m logger \u001b[39m=\u001b[39m get_logger(\u001b[39m__name__\u001b[39m)\n",
42
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/datasets/utils/py_utils.py:41\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mdill\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmultiprocess\u001b[39;00m\n\u001b[0;32m---> 41\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mmultiprocess\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpool\u001b[39;00m\n\u001b[1;32m 42\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnumpy\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mnp\u001b[39;00m\n\u001b[1;32m 43\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpackaging\u001b[39;00m \u001b[39mimport\u001b[39;00m version\n",
43
+ "File \u001b[0;32m~/SentenceStructureComparision/venv/lib/python3.10/site-packages/multiprocess/pool.py:29\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[39m# If threading is available then ThreadPool should be provided. Therefore\u001b[39;00m\n\u001b[1;32m 27\u001b[0m \u001b[39m# we avoid top-level imports which are liable to fail on some systems.\u001b[39;00m\n\u001b[1;32m 28\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m \u001b[39mimport\u001b[39;00m util\n\u001b[0;32m---> 29\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m \u001b[39mimport\u001b[39;00m get_context, \u001b[39mTimeoutError\u001b[39;00m\n\u001b[1;32m 30\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mconnection\u001b[39;00m \u001b[39mimport\u001b[39;00m wait\n\u001b[1;32m 32\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[1;32m 33\u001b[0m \u001b[39m# Constants representing the state of a pool\u001b[39;00m\n\u001b[1;32m 34\u001b[0m \u001b[39m#\u001b[39;00m\n",
44
+ "\u001b[0;31mImportError\u001b[0m: cannot import name 'get_context' from 'multiprocess' (/home/ubuntu/SentenceStructureComparision/venv/lib/python3.10/site-packages/multiprocess/__init__.py)"
45
+ ]
46
+ }
47
+ ],
48
+ "source": [
49
+ "from datasets import load_dataset\n",
50
+ "from tqdm import tqdm\n",
51
+ "import pandas as pd\n",
52
+ "import pyarrow as pa\n",
53
+ "import pyarrow.compute as pc\n",
54
+ "import re\n",
55
+ "import pyarrow.dataset as ds"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "name": "stderr",
65
+ "output_type": "stream",
66
+ "text": [
67
+ "Downloading: 100%|██████████| 15.3k/15.3k [00:00<00:00, 19.5MB/s]\n",
68
+ "Downloading: 3%|▎ | 703M/20.3G [00:11<05:19, 61.2MB/s] "
69
+ ]
70
+ }
71
+ ],
72
+ "source": [
73
+ "dataset = load_dataset(\"wikipedia\", \"20220301.en\", split=\"train\")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 3,
79
+ "metadata": {},
80
+ "outputs": [],
81
+ "source": [
82
+ "data_df = dataset.to_pandas()"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": 6,
88
+ "metadata": {},
89
+ "outputs": [],
90
+ "source": [
91
+ "def keep_intro(text):\n",
92
+ " return re.sub(r'\\(\\W*\\)', '', ' '.join(text.split('\\n\\n')[:2]))"
93
+ ]
94
+ },
95
+ {
96
+ "cell_type": "code",
97
+ "execution_count": 7,
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "data_df['intro'] = data_df.apply(lambda row: keep_intro(row['text']), axis=1)"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 9,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "data_df[['id', 'title', 'url', 'intro']].to_csv('wiki_intro.csv', index=False)"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": 3,
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "data_df = pd.read_csv('wiki_intro.csv')"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 4,
125
+ "metadata": {},
126
+ "outputs": [
127
+ {
128
+ "data": {
129
+ "text/plain": [
130
+ "6458670"
131
+ ]
132
+ },
133
+ "execution_count": 4,
134
+ "metadata": {},
135
+ "output_type": "execute_result"
136
+ }
137
+ ],
138
+ "source": [
139
+ "len(data_df)"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 5,
145
+ "metadata": {},
146
+ "outputs": [
147
+ {
148
+ "data": {
149
+ "text/html": [
150
+ "<div>\n",
151
+ "<style scoped>\n",
152
+ " .dataframe tbody tr th:only-of-type {\n",
153
+ " vertical-align: middle;\n",
154
+ " }\n",
155
+ "\n",
156
+ " .dataframe tbody tr th {\n",
157
+ " vertical-align: top;\n",
158
+ " }\n",
159
+ "\n",
160
+ " .dataframe thead th {\n",
161
+ " text-align: right;\n",
162
+ " }\n",
163
+ "</style>\n",
164
+ "<table border=\"1\" class=\"dataframe\">\n",
165
+ " <thead>\n",
166
+ " <tr style=\"text-align: right;\">\n",
167
+ " <th></th>\n",
168
+ " <th>id</th>\n",
169
+ " <th>title</th>\n",
170
+ " <th>url</th>\n",
171
+ " <th>intro</th>\n",
172
+ " </tr>\n",
173
+ " </thead>\n",
174
+ " <tbody>\n",
175
+ " <tr>\n",
176
+ " <th>0</th>\n",
177
+ " <td>12</td>\n",
178
+ " <td>Anarchism</td>\n",
179
+ " <td>https://en.wikipedia.org/wiki/Anarchism</td>\n",
180
+ " <td>Anarchism is a political philosophy and moveme...</td>\n",
181
+ " </tr>\n",
182
+ " <tr>\n",
183
+ " <th>1</th>\n",
184
+ " <td>25</td>\n",
185
+ " <td>Autism</td>\n",
186
+ " <td>https://en.wikipedia.org/wiki/Autism</td>\n",
187
+ " <td>Autism is a neurodevelopmental disorder charac...</td>\n",
188
+ " </tr>\n",
189
+ " <tr>\n",
190
+ " <th>2</th>\n",
191
+ " <td>39</td>\n",
192
+ " <td>Albedo</td>\n",
193
+ " <td>https://en.wikipedia.org/wiki/Albedo</td>\n",
194
+ " <td>Albedo is the measure of the diffuse reflecti...</td>\n",
195
+ " </tr>\n",
196
+ " <tr>\n",
197
+ " <th>3</th>\n",
198
+ " <td>290</td>\n",
199
+ " <td>A</td>\n",
200
+ " <td>https://en.wikipedia.org/wiki/A</td>\n",
201
+ " <td>A, or a, is the first letter and the first vow...</td>\n",
202
+ " </tr>\n",
203
+ " <tr>\n",
204
+ " <th>4</th>\n",
205
+ " <td>303</td>\n",
206
+ " <td>Alabama</td>\n",
207
+ " <td>https://en.wikipedia.org/wiki/Alabama</td>\n",
208
+ " <td>Alabama is a state in the Southeastern region...</td>\n",
209
+ " </tr>\n",
210
+ " <tr>\n",
211
+ " <th>...</th>\n",
212
+ " <td>...</td>\n",
213
+ " <td>...</td>\n",
214
+ " <td>...</td>\n",
215
+ " <td>...</td>\n",
216
+ " </tr>\n",
217
+ " <tr>\n",
218
+ " <th>6458665</th>\n",
219
+ " <td>70201819</td>\n",
220
+ " <td>Bianca Fernandez</td>\n",
221
+ " <td>https://en.wikipedia.org/wiki/Bianca%20Fernandez</td>\n",
222
+ " <td>Bianca Jolie Fernandez (born 24 February 2004)...</td>\n",
223
+ " </tr>\n",
224
+ " <tr>\n",
225
+ " <th>6458666</th>\n",
226
+ " <td>70201882</td>\n",
227
+ " <td>Condons and Clangibbon</td>\n",
228
+ " <td>https://en.wikipedia.org/wiki/Condons%20and%20...</td>\n",
229
+ " <td>Condons and Clangibbon is a barony in County ...</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>6458667</th>\n",
233
+ " <td>70201886</td>\n",
234
+ " <td>2022 Chattanooga Red Wolves SC season</td>\n",
235
+ " <td>https://en.wikipedia.org/wiki/2022%20Chattanoo...</td>\n",
236
+ " <td>The 2022 Chattanooga Red Wolves SC season will...</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>6458668</th>\n",
240
+ " <td>70201947</td>\n",
241
+ " <td>Nkiko Prosper</td>\n",
242
+ " <td>https://en.wikipedia.org/wiki/Nkiko%20Prosper</td>\n",
243
+ " <td>Turatsinze Nkiko Prosper (born 1985) professio...</td>\n",
244
+ " </tr>\n",
245
+ " <tr>\n",
246
+ " <th>6458669</th>\n",
247
+ " <td>70201959</td>\n",
248
+ " <td>Michael O'Donnell (Missouri politician)</td>\n",
249
+ " <td>https://en.wikipedia.org/wiki/Michael%20O%27Do...</td>\n",
250
+ " <td>Michael A. O'Donnell (born June 17, 1968) is a...</td>\n",
251
+ " </tr>\n",
252
+ " </tbody>\n",
253
+ "</table>\n",
254
+ "<p>6458670 rows × 4 columns</p>\n",
255
+ "</div>"
256
+ ],
257
+ "text/plain": [
258
+ " id title \\\n",
259
+ "0 12 Anarchism \n",
260
+ "1 25 Autism \n",
261
+ "2 39 Albedo \n",
262
+ "3 290 A \n",
263
+ "4 303 Alabama \n",
264
+ "... ... ... \n",
265
+ "6458665 70201819 Bianca Fernandez \n",
266
+ "6458666 70201882 Condons and Clangibbon \n",
267
+ "6458667 70201886 2022 Chattanooga Red Wolves SC season \n",
268
+ "6458668 70201947 Nkiko Prosper \n",
269
+ "6458669 70201959 Michael O'Donnell (Missouri politician) \n",
270
+ "\n",
271
+ " url \\\n",
272
+ "0 https://en.wikipedia.org/wiki/Anarchism \n",
273
+ "1 https://en.wikipedia.org/wiki/Autism \n",
274
+ "2 https://en.wikipedia.org/wiki/Albedo \n",
275
+ "3 https://en.wikipedia.org/wiki/A \n",
276
+ "4 https://en.wikipedia.org/wiki/Alabama \n",
277
+ "... ... \n",
278
+ "6458665 https://en.wikipedia.org/wiki/Bianca%20Fernandez \n",
279
+ "6458666 https://en.wikipedia.org/wiki/Condons%20and%20... \n",
280
+ "6458667 https://en.wikipedia.org/wiki/2022%20Chattanoo... \n",
281
+ "6458668 https://en.wikipedia.org/wiki/Nkiko%20Prosper \n",
282
+ "6458669 https://en.wikipedia.org/wiki/Michael%20O%27Do... \n",
283
+ "\n",
284
+ " intro \n",
285
+ "0 Anarchism is a political philosophy and moveme... \n",
286
+ "1 Autism is a neurodevelopmental disorder charac... \n",
287
+ "2 Albedo is the measure of the diffuse reflecti... \n",
288
+ "3 A, or a, is the first letter and the first vow... \n",
289
+ "4 Alabama is a state in the Southeastern region... \n",
290
+ "... ... \n",
291
+ "6458665 Bianca Jolie Fernandez (born 24 February 2004)... \n",
292
+ "6458666 Condons and Clangibbon is a barony in County ... \n",
293
+ "6458667 The 2022 Chattanooga Red Wolves SC season will... \n",
294
+ "6458668 Turatsinze Nkiko Prosper (born 1985) professio... \n",
295
+ "6458669 Michael A. O'Donnell (born June 17, 1968) is a... \n",
296
+ "\n",
297
+ "[6458670 rows x 4 columns]"
298
+ ]
299
+ },
300
+ "execution_count": 5,
301
+ "metadata": {},
302
+ "output_type": "execute_result"
303
+ }
304
+ ],
305
+ "source": [
306
+ "data_df"
307
+ ]
308
+ },
309
+ {
310
+ "attachments": {},
311
+ "cell_type": "markdown",
312
+ "metadata": {},
313
+ "source": [
314
+ "## Data Cleaning"
315
+ ]
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": 14,
320
+ "metadata": {},
321
+ "outputs": [],
322
+ "source": [
323
+ "data_df = data_df[data_df.apply(lambda x : len(str(x['title'])) > 1, axis = 1)]"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": 15,
329
+ "metadata": {},
330
+ "outputs": [
331
+ {
332
+ "name": "stderr",
333
+ "output_type": "stream",
334
+ "text": [
335
+ "/var/folders/b5/qwcw21mn3973c1z_c12dw6hw0000gn/T/ipykernel_849/2014648341.py:1: SettingWithCopyWarning: \n",
336
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
337
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
338
+ "\n",
339
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
340
+ " data_df['intro_len'] = data_df.apply(lambda row: len(row['intro'].split(' ')), axis = 1)\n"
341
+ ]
342
+ }
343
+ ],
344
+ "source": [
345
+ "data_df['intro_len'] = data_df.apply(lambda row: len(row['intro'].split(' ')), axis = 1)"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": 16,
351
+ "metadata": {},
352
+ "outputs": [
353
+ {
354
+ "data": {
355
+ "text/plain": [
356
+ "<AxesSubplot: >"
357
+ ]
358
+ },
359
+ "execution_count": 16,
360
+ "metadata": {},
361
+ "output_type": "execute_result"
362
+ },
363
+ {
364
+ "data": {
365
+ "image/png": "",
366
+ "text/plain": [
367
+ "<Figure size 640x480 with 1 Axes>"
368
+ ]
369
+ },
370
+ "metadata": {},
371
+ "output_type": "display_data"
372
+ }
373
+ ],
374
+ "source": [
375
+ "data_df['intro_len'].hist(bins=[0,50,100,150,200,250,300,350,400])"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 20,
381
+ "metadata": {},
382
+ "outputs": [],
383
+ "source": [
384
+ "data_df = data_df[data_df['intro_len'].between(150, 350)]"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 23,
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "data_df['title_len'] = data_df.apply(lambda row: len(row['title'].split(' ')), axis = 1)"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": 26,
399
+ "metadata": {},
400
+ "outputs": [
401
+ {
402
+ "data": {
403
+ "text/plain": [
404
+ "<AxesSubplot: >"
405
+ ]
406
+ },
407
+ "execution_count": 26,
408
+ "metadata": {},
409
+ "output_type": "execute_result"
410
+ },
411
+ {
412
+ "data": {
413
+ "image/png": "",
414
+ "text/plain": [
415
+ "<Figure size 640x480 with 1 Axes>"
416
+ ]
417
+ },
418
+ "metadata": {},
419
+ "output_type": "display_data"
420
+ }
421
+ ],
422
+ "source": [
423
+ "data_df.title_len.hist(bins=[1,2,3,4,5,6,7,8,9,10])"
424
+ ]
425
+ },
426
+ {
427
+ "cell_type": "code",
428
+ "execution_count": 27,
429
+ "metadata": {},
430
+ "outputs": [],
431
+ "source": [
432
+ "data_df = data_df[data_df['title_len'] < 4]"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": 34,
438
+ "metadata": {},
439
+ "outputs": [],
440
+ "source": [
441
+ "data_df = data_df.sample(n=150000, random_state=1111)"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 35,
447
+ "metadata": {},
448
+ "outputs": [
449
+ {
450
+ "data": {
451
+ "text/plain": [
452
+ "<AxesSubplot: >"
453
+ ]
454
+ },
455
+ "execution_count": 35,
456
+ "metadata": {},
457
+ "output_type": "execute_result"
458
+ },
459
+ {
460
+ "data": {
461
+ "image/png": "",
462
+ "text/plain": [
463
+ "<Figure size 640x480 with 1 Axes>"
464
+ ]
465
+ },
466
+ "metadata": {},
467
+ "output_type": "display_data"
468
+ }
469
+ ],
470
+ "source": [
471
+ "data_df['intro_len'].hist()"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": 4,
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": [
480
+ "data_df['starter_text'] = data_df['intro'].apply(lambda x: ' '.join(str(x).split(' ')[:7]))"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": 6,
486
+ "metadata": {},
487
+ "outputs": [],
488
+ "source": [
489
+ "data_df.to_csv('data/wiki_intro_processed.csv', index=False)"
490
+ ]
491
+ }
492
+ ],
493
+ "metadata": {
494
+ "kernelspec": {
495
+ "display_name": "venv",
496
+ "language": "python",
497
+ "name": "python3"
498
+ },
499
+ "language_info": {
500
+ "codemirror_mode": {
501
+ "name": "ipython",
502
+ "version": 3
503
+ },
504
+ "file_extension": ".py",
505
+ "mimetype": "text/x-python",
506
+ "name": "python",
507
+ "nbconvert_exporter": "python",
508
+ "pygments_lexer": "ipython3",
509
+ "version": "3.10.12"
510
+ },
511
+ "orig_nbformat": 4,
512
+ "vscode": {
513
+ "interpreter": {
514
+ "hash": "3f100d68d9cf80676b1a4c3ace5430b03ae266a1d88e3f101eb196b64b263632"
515
+ }
516
+ }
517
+ },
518
+ "nbformat": 4,
519
+ "nbformat_minor": 2
520
+ }
wiki_gpt/requirements.txt ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiohttp==3.8.3
2
+ aiosignal==1.3.1
3
+ apache-beam
4
+ appnope==0.1.3
5
+ asttokens==2.2.1
6
+ async-timeout==4.0.2
7
+ attrs==22.2.0
8
+ backcall==0.2.0
9
+ certifi==2022.12.7
10
+ charset-normalizer==2.1.1
11
+ click==8.1.3
12
+ cloudpickle==2.2.1
13
+ comm==0.1.2
14
+ contourpy==1.0.7
15
+ crcmod==1.7
16
+ cycler==0.11.0
17
+ dask==2023.1.1
18
+ datasets
19
+ debugpy==1.6.6
20
+ decorator==5.1.1
21
+ dill
22
+ docopt==0.6.2
23
+ executing==1.2.0
24
+ fastavro==1.7.1
25
+ fasteners==0.18
26
+ filelock==3.9.0
27
+ fonttools==4.38.0
28
+ frozenlist==1.3.3
29
+ fsspec==2023.1.0
30
+ grpcio==1.51.1
31
+ hdfs==2.7.0
32
+ httplib2==0.20.4
33
+ huggingface-hub==0.12.0
34
+ idna==3.4
35
+ install==1.3.5
36
+ ipykernel==6.20.2
37
+ ipython==8.9.0
38
+ jedi==0.18.2
39
+ jupyter_client==8.0.2
40
+ jupyter_core==5.2.0
41
+ kiwisolver==1.4.4
42
+ locket==1.0.0
43
+ matplotlib==3.6.3
44
+ matplotlib-inline==0.1.6
45
+ multidict==6.0.4
46
+ multiprocess
47
+ mwparserfromhell==0.6.4
48
+ nest-asyncio==1.5.6
49
+ numpy==1.22.4
50
+ objsize==0.6.1
51
+ openai==0.26.4
52
+ orjson==3.8.5
53
+ packaging==23.0
54
+ pandas==1.5.3
55
+ parso==0.8.3
56
+ partd==1.3.0
57
+ pexpect==4.8.0
58
+ pickleshare==0.7.5
59
+ Pillow==9.4.0
60
+ pip==23.0
61
+ platformdirs==2.6.2
62
+ prompt-toolkit==3.0.36
63
+ proto-plus==1.22.2
64
+ protobuf==3.19.4
65
+ psutil==5.9.4
66
+ ptyprocess==0.7.0
67
+ pure-eval==0.2.2
68
+ pyarrow==9.0.0
69
+ pydot==1.4.2
70
+ Pygments==2.14.0
71
+ pymongo==3.13.0
72
+ pyparsing==3.0.9
73
+ python-dateutil==2.8.2
74
+ pytz==2022.7.1
75
+ PyYAML==6.0
76
+ pyzmq==25.0.0
77
+ regex==2022.10.31
78
+ requests==2.28.2
79
+ responses==0.18.0
80
+ setuptools==63.4.3
81
+ six==1.16.0
82
+ stack-data==0.6.2
83
+ toolz==0.12.0
84
+ tornado==6.2
85
+ tqdm==4.64.1
86
+ traitlets==5.9.0
87
+ typing_extensions==4.4.0
88
+ urllib3==1.26.14
89
+ wcwidth==0.2.6
90
+ xxhash==3.2.0
91
+ yarl==1.8.2
92
+ zstandard==0.19.0