valeriedaash committed on
Commit
edf802b
·
1 Parent(s): 9d7007a

data updated

Browse files
Files changed (2) hide show
  1. data_final.csv +2 -2
  2. project.ipynb +65 -2
data_final.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6656361b1e4b9e9a9200fe464ca227c0ce285af2150975072f48e548a0fbd1b7
3
- size 32055320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7212f78b7f7cbdb168779a05e4ee7e522093f00fb8994d9d4689b2e11cc73f77
3
+ size 30694340
project.ipynb CHANGED
@@ -421,11 +421,74 @@
421
  },
422
  {
423
  "cell_type": "code",
424
- "execution_count": 83,
425
  "metadata": {},
426
  "outputs": [],
427
  "source": [
428
- "# filtered_df.to_csv('filtered_data_without_dubs.csv', index=False) "
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
429
  ]
430
  }
431
  ],
 
421
  },
422
  {
423
  "cell_type": "code",
424
+ "execution_count": 2,
425
  "metadata": {},
426
  "outputs": [],
427
  "source": [
428
+ "import pandas as pd\n",
429
+ "df = pd.read_csv('data_final.csv')"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "code",
434
+ "execution_count": 5,
435
+ "metadata": {},
436
+ "outputs": [
437
+ {
438
+ "name": "stderr",
439
+ "output_type": "stream",
440
+ "text": [
441
+ "/tmp/ipykernel_27471/112674152.py:2: SettingWithCopyWarning: \n",
442
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
443
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
444
+ "\n",
445
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
446
+ " df['annotation'] = df['annotation'].apply(lambda text: ''.join([symbol for symbol in text if symbol not in [';']]).strip().lower())\n"
447
+ ]
448
+ }
449
+ ],
450
+ "source": [
451
+ "df = df[df['annotation'].str.split().str.len() >= 50]\n",
452
+ "df['annotation'] = df['annotation'].apply(lambda text: ''.join([symbol for symbol in text if symbol not in [';']]).strip().lower())\n",
453
+ "df.reset_index(drop=True, inplace=True)"
454
+ ]
455
+ },
456
+ {
457
+ "cell_type": "code",
458
+ "execution_count": 6,
459
+ "metadata": {},
460
+ "outputs": [
461
+ {
462
+ "name": "stdout",
463
+ "output_type": "stream",
464
+ "text": [
465
+ "<class 'pandas.core.frame.DataFrame'>\n",
466
+ "RangeIndex: 17774 entries, 0 to 17773\n",
467
+ "Data columns (total 6 columns):\n",
468
+ " # Column Non-Null Count Dtype \n",
469
+ "--- ------ -------------- ----- \n",
470
+ " 0 page_url 17774 non-null object\n",
471
+ " 1 image_url 17774 non-null object\n",
472
+ " 2 author 17774 non-null object\n",
473
+ " 3 title 17774 non-null object\n",
474
+ " 4 annotation 17774 non-null object\n",
475
+ " 5 category 17774 non-null object\n",
476
+ "dtypes: object(6)\n",
477
+ "memory usage: 833.3+ KB\n"
478
+ ]
479
+ }
480
+ ],
481
+ "source": [
482
+ "df.info()"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": 7,
488
+ "metadata": {},
489
+ "outputs": [],
490
+ "source": [
491
+ "df.to_csv('data_final.csv', index=False) "
492
  ]
493
  }
494
  ],