Spaces:

valeriedaash
/

find_my_book

Sleeping

App Files Files Community

valeriedaash committed on Mar 14, 2024

Commit

edf802b

1 Parent(s): 9d7007a

data updated

Browse files

Files changed (2) hide show

data_final.csv +2 -2
project.ipynb +65 -2

data_final.csv CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6656361b1e4b9e9a9200fe464ca227c0ce285af2150975072f48e548a0fbd1b7
-size 32055320

 version https://git-lfs.github.com/spec/v1
+oid sha256:7212f78b7f7cbdb168779a05e4ee7e522093f00fb8994d9d4689b2e11cc73f77
+size 30694340

project.ipynb CHANGED Viewed

@@ -421,11 +421,74 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 83,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# filtered_df.to_csv('filtered_data_without_dubs.csv', index=False) "
    ]
   }
  ],

   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
+    "import pandas as pd\n",
+    "df = pd.read_csv('data_final.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_27471/112674152.py:2: SettingWithCopyWarning: \n",
+      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+      "Try using .loc[row_indexer,col_indexer] = value instead\n",
+      "\n",
+      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+      "  df['annotation'] = df['annotation'].apply(lambda text: ''.join([symbol for symbol in text if symbol not in [';']]).strip().lower())\n"
+     ]
+    }
+   ],
+   "source": [
+    "df = df[df['annotation'].str.split().str.len() >= 50]\n",
+    "df['annotation'] = df['annotation'].apply(lambda text: ''.join([symbol for symbol in text if symbol not in [';']]).strip().lower())\n",
+    "df.reset_index(drop=True, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 17774 entries, 0 to 17773\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column      Non-Null Count  Dtype \n",
+      "---  ------      --------------  ----- \n",
+      " 0   page_url    17774 non-null  object\n",
+      " 1   image_url   17774 non-null  object\n",
+      " 2   author      17774 non-null  object\n",
+      " 3   title       17774 non-null  object\n",
+      " 4   annotation  17774 non-null  object\n",
+      " 5   category    17774 non-null  object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 833.3+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv('data_final.csv', index=False) "
    ]
   }
  ],