Spaces:
Sleeping
Sleeping
Commit
·
edf802b
1
Parent(s):
9d7007a
data updated
Browse files- data_final.csv +2 -2
- project.ipynb +65 -2
data_final.csv
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7212f78b7f7cbdb168779a05e4ee7e522093f00fb8994d9d4689b2e11cc73f77
|
3 |
+
size 30694340
|
project.ipynb
CHANGED
@@ -421,11 +421,74 @@
|
|
421 |
},
|
422 |
{
|
423 |
"cell_type": "code",
|
424 |
-
"execution_count":
|
425 |
"metadata": {},
|
426 |
"outputs": [],
|
427 |
"source": [
|
428 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
429 |
]
|
430 |
}
|
431 |
],
|
|
|
421 |
},
|
422 |
{
|
423 |
"cell_type": "code",
|
424 |
+
"execution_count": 2,
|
425 |
"metadata": {},
|
426 |
"outputs": [],
|
427 |
"source": [
|
428 |
+
"import pandas as pd\n",
|
429 |
+
"df = pd.read_csv('data_final.csv')"
|
430 |
+
]
|
431 |
+
},
|
432 |
+
{
|
433 |
+
"cell_type": "code",
|
434 |
+
"execution_count": 5,
|
435 |
+
"metadata": {},
|
436 |
+
"outputs": [
|
437 |
+
{
|
438 |
+
"name": "stderr",
|
439 |
+
"output_type": "stream",
|
440 |
+
"text": [
|
441 |
+
"/tmp/ipykernel_27471/112674152.py:2: SettingWithCopyWarning: \n",
|
442 |
+
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
443 |
+
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
444 |
+
"\n",
|
445 |
+
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
446 |
+
" df['annotation'] = df['annotation'].apply(lambda text: ''.join([symbol for symbol in text if symbol not in [';']]).strip().lower())\n"
|
447 |
+
]
|
448 |
+
}
|
449 |
+
],
|
450 |
+
"source": [
|
451 |
+
"df = df[df['annotation'].str.split().str.len() >= 50]\n",
|
452 |
+
"df['annotation'] = df['annotation'].apply(lambda text: ''.join([symbol for symbol in text if symbol not in [';']]).strip().lower())\n",
|
453 |
+
"df.reset_index(drop=True, inplace=True)"
|
454 |
+
]
|
455 |
+
},
|
456 |
+
{
|
457 |
+
"cell_type": "code",
|
458 |
+
"execution_count": 6,
|
459 |
+
"metadata": {},
|
460 |
+
"outputs": [
|
461 |
+
{
|
462 |
+
"name": "stdout",
|
463 |
+
"output_type": "stream",
|
464 |
+
"text": [
|
465 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
466 |
+
"RangeIndex: 17774 entries, 0 to 17773\n",
|
467 |
+
"Data columns (total 6 columns):\n",
|
468 |
+
" # Column Non-Null Count Dtype \n",
|
469 |
+
"--- ------ -------------- ----- \n",
|
470 |
+
" 0 page_url 17774 non-null object\n",
|
471 |
+
" 1 image_url 17774 non-null object\n",
|
472 |
+
" 2 author 17774 non-null object\n",
|
473 |
+
" 3 title 17774 non-null object\n",
|
474 |
+
" 4 annotation 17774 non-null object\n",
|
475 |
+
" 5 category 17774 non-null object\n",
|
476 |
+
"dtypes: object(6)\n",
|
477 |
+
"memory usage: 833.3+ KB\n"
|
478 |
+
]
|
479 |
+
}
|
480 |
+
],
|
481 |
+
"source": [
|
482 |
+
"df.info()"
|
483 |
+
]
|
484 |
+
},
|
485 |
+
{
|
486 |
+
"cell_type": "code",
|
487 |
+
"execution_count": 7,
|
488 |
+
"metadata": {},
|
489 |
+
"outputs": [],
|
490 |
+
"source": [
|
491 |
+
"df.to_csv('data_final.csv', index=False) "
|
492 |
]
|
493 |
}
|
494 |
],
|