{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "code", "source": [ "# Importing necessary libraries\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.preprocessing import LabelEncoder\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt" ], "metadata": { "id": "BR18gDsoQIka" }, "execution_count": 1, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "\n", "# Mount Google Drive\n", "drive.mount('/content/drive')" ], "metadata": { "id": "h_60ELJv-HPZ", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "132e363b-d885-4c88-ed30-bbed8b9b9d6c" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "code", "source": [ "# Specifying filepath from the Google drive\n", "file_path = '/content/drive/MyDrive/eDNA/BOLD database/BOLD_Public.19-Apr-2024.tsv'" ], "metadata": { "id": "NXRh7rBR19x-" }, "execution_count": 3, "outputs": [] }, { "cell_type": "code", "source": [ "# Load the BOLD TSV by hand, keeping only lines that have the expected\n", "# number of tab-separated fields. Every value is kept as a string.\n", "\n", "# Accepted rows; the first accepted row is the header line\n", "data = []\n", "\n", "# Expected number of fields\n", "e_n_of_f = 56\n", "\n", "# Number of lines rejected because their field count differs\n", "n_skipped = 0\n", "\n", "with open(file_path, 'r', encoding='utf-8') as f:\n", "    for line in f:\n", "        # Remove only the line terminator: a bare strip() would also eat\n", "        # trailing tabs, so rows whose last fields are empty would lose\n", "        # columns and be silently discarded by the length check below.\n", "        fields = line.rstrip('\\r\\n').split('\\t')\n", "        if len(fields) == e_n_of_f:\n", "            data.append(fields)\n", "        else:\n", "            n_skipped += 1\n", "\n", "# Make the amount of discarded input visible instead of dropping it silently\n", "print(f'Kept {len(data)} lines, skipped {n_skipped} malformed lines')\n", "\n", "if not data:\n", "    raise ValueError(f'No line in {file_path} has {e_n_of_f} fields')\n", "\n", "# Build the DataFrame (the header is still row 0 at this point)\n", "df_0 = pd.DataFrame(data)\n", "\n", "# Setting the header from the first row\n", "df_0.columns = df_0.iloc[0]\n", "\n", "# Dropping the header row that is now duplicated in the data\n", "df_0 = df_0.drop(index=0)\n", "\n", "# Inspecting the first 5 rows\n", "df_0.head()\n" ], "metadata": { "id": "Pmmmr_ogRL2Z", "colab": { "base_uri": "https://localhost:8080/", "height": 516 }, "outputId": "5c13a74b-621b-427b-fcb7-6aa01a7c5f48" }, "execution_count": 4, "outputs": [ {
"output_type": "execute_result", "data": { "text/plain": [ "0 processid sampleid specimenid museumid fieldid \\\n", "1 AAASF001-17 CBGSFMX-0101 7804897 None CBGSFMX-0101 \n", "2 AAASF004-17 CBGSFMX-0301 7804900 None CBGSFMX-0301 \n", "3 AAASF005-17 CBGSFMX-0302 7804901 None CBGSFMX-0302 \n", "4 AAASF006-17 CBGSFMX-0303 7804902 None CBGSFMX-0303 \n", "5 AAASF007-17 CBGSFMX-0304 7804903 None CBGSFMX-0304 \n", "\n", "0 inst bin_uri identification \\\n", "1 Universidad Autonoma de Nuevo Leon BOLD:ADP3520 Lutzomyia cruciata \n", "2 Universidad Autonoma de Nuevo Leon BOLD:AAY5017 Lutzomyia longipalpis \n", "3 Universidad Autonoma de Nuevo Leon BOLD:AAY5017 Lutzomyia longipalpis \n", "4 Universidad Autonoma de Nuevo Leon BOLD:AAY5017 Lutzomyia longipalpis \n", "5 Universidad Autonoma de Nuevo Leon BOLD:AAY5017 Lutzomyia longipalpis \n", "\n", "0 funding_src kingdom ... species_reference identification_method \\\n", "1 None Animalia ... None Morphological \n", "2 None Animalia ... None Morphological \n", "3 None Animalia ... None Morphological \n", "4 None Animalia ... None Morphological \n", "5 None Animalia ... None Morphological \n", "\n", "0 recordset_code_arr gb_acs marker_code \\\n", "1 ['AAASF', 'DS-17IBMWP', 'DS-UNIQUE17'] MK851247 COI-5P \n", "2 ['AAASF', 'DS-17IBMWP'] MK851267 COI-5P \n", "3 ['AAASF'] MK851266 COI-5P \n", "4 ['AAASF', 'DS-17IBMWP'] MK851265 COI-5P \n", "5 ['AAASF', 'DS-17IBMWP'] MK851254 COI-5P \n", "\n", "0 nucraw \\\n", "1 AACATTATATTTTATTTTTGGAGCCTGAGCAGGAATAGTGGGAACA... \n", "2 GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... \n", "3 GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... \n", "4 GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... \n", "5 GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... \n", "\n", "0 sequence_run_site processid_minted_date \\\n", "1 Instituto Politecnico Nacional, Centro de Biot... 30-May-2017 \n", "2 Instituto Politecnico Nacional, Centro de Biot... 
30-May-2017 \n", "3 Instituto Politecnico Nacional, Centro de Biot... 30-May-2017 \n", "4 Instituto Politecnico Nacional, Centro de Biot... 30-May-2017 \n", "5 Instituto Politecnico Nacional, Centro de Biot... 30-May-2017 \n", "\n", "0 sequence_upload_date identification_rank \n", "1 12-Jun-2017 species \n", "2 14-Jun-2017 species \n", "3 14-Jun-2017 species \n", "4 14-Jun-2017 species \n", "5 14-Jun-2017 species \n", "\n", "[5 rows x 56 columns]" ], "text/html": [ "\n", "
\n", " | processid | \n", "sampleid | \n", "specimenid | \n", "museumid | \n", "fieldid | \n", "inst | \n", "bin_uri | \n", "identification | \n", "funding_src | \n", "kingdom | \n", "... | \n", "species_reference | \n", "identification_method | \n", "recordset_code_arr | \n", "gb_acs | \n", "marker_code | \n", "nucraw | \n", "sequence_run_site | \n", "processid_minted_date | \n", "sequence_upload_date | \n", "identification_rank | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "AAASF001-17 | \n", "CBGSFMX-0101 | \n", "7804897 | \n", "None | \n", "CBGSFMX-0101 | \n", "Universidad Autonoma de Nuevo Leon | \n", "BOLD:ADP3520 | \n", "Lutzomyia cruciata | \n", "None | \n", "Animalia | \n", "... | \n", "None | \n", "Morphological | \n", "['AAASF', 'DS-17IBMWP', 'DS-UNIQUE17'] | \n", "MK851247 | \n", "COI-5P | \n", "AACATTATATTTTATTTTTGGAGCCTGAGCAGGAATAGTGGGAACA... | \n", "Instituto Politecnico Nacional, Centro de Biot... | \n", "30-May-2017 | \n", "12-Jun-2017 | \n", "species | \n", "
2 | \n", "AAASF004-17 | \n", "CBGSFMX-0301 | \n", "7804900 | \n", "None | \n", "CBGSFMX-0301 | \n", "Universidad Autonoma de Nuevo Leon | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "None | \n", "Animalia | \n", "... | \n", "None | \n", "Morphological | \n", "['AAASF', 'DS-17IBMWP'] | \n", "MK851267 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "Instituto Politecnico Nacional, Centro de Biot... | \n", "30-May-2017 | \n", "14-Jun-2017 | \n", "species | \n", "
3 | \n", "AAASF005-17 | \n", "CBGSFMX-0302 | \n", "7804901 | \n", "None | \n", "CBGSFMX-0302 | \n", "Universidad Autonoma de Nuevo Leon | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "None | \n", "Animalia | \n", "... | \n", "None | \n", "Morphological | \n", "['AAASF'] | \n", "MK851266 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "Instituto Politecnico Nacional, Centro de Biot... | \n", "30-May-2017 | \n", "14-Jun-2017 | \n", "species | \n", "
4 | \n", "AAASF006-17 | \n", "CBGSFMX-0303 | \n", "7804902 | \n", "None | \n", "CBGSFMX-0303 | \n", "Universidad Autonoma de Nuevo Leon | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "None | \n", "Animalia | \n", "... | \n", "None | \n", "Morphological | \n", "['AAASF', 'DS-17IBMWP'] | \n", "MK851265 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "Instituto Politecnico Nacional, Centro de Biot... | \n", "30-May-2017 | \n", "14-Jun-2017 | \n", "species | \n", "
5 | \n", "AAASF007-17 | \n", "CBGSFMX-0304 | \n", "7804903 | \n", "None | \n", "CBGSFMX-0304 | \n", "Universidad Autonoma de Nuevo Leon | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "None | \n", "Animalia | \n", "... | \n", "None | \n", "Morphological | \n", "['AAASF', 'DS-17IBMWP'] | \n", "MK851254 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "Instituto Politecnico Nacional, Centro de Biot... | \n", "30-May-2017 | \n", "14-Jun-2017 | \n", "species | \n", "
5 rows × 56 columns
\n", "\n", " | 0 | \n", "
---|---|
0 | \n", "None | \n", "
1 | \n", "1.1 Boreal Forest | \n", "
2 | \n", "Savannah | \n", "
3 | \n", "Plankton | \n", "
4 | \n", "Water column | \n", "
... | \n", "... | \n", "
71 | \n", "Poa foliosa (tussock) | \n", "
72 | \n", "Chicken plots | \n", "
73 | \n", "4. Native Grassland | 4.1. Tundra | \n", "
74 | \n", "4. Native Grassland|4.2 Subarctic Grassland | \n", "
75 | \n", "1. Forest|1.2 Subarctic Forest | \n", "
76 rows × 1 columns
\n", "\n", " | processid | \n", "specimenid | \n", "bin_uri | \n", "identification | \n", "kingdom | \n", "phylum | \n", "class | \n", "order | \n", "family | \n", "subfamily | \n", "... | \n", "country | \n", "province | \n", "region | \n", "sector | \n", "site | \n", "habitat | \n", "gb_acs | \n", "marker_code | \n", "nucraw | \n", "identification_rank | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "AAASF001-17 | \n", "7804897 | \n", "BOLD:ADP3520 | \n", "Lutzomyia cruciata | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Mexico | \n", "Quintana Roo | \n", "Candelaria | \n", "None | \n", "None | \n", "None | \n", "MK851247 | \n", "COI-5P | \n", "AACATTATATTTTATTTTTGGAGCCTGAGCAGGAATAGTGGGAACA... | \n", "species | \n", "
2 | \n", "AAASF004-17 | \n", "7804900 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Mexico | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851267 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "
3 | \n", "AAASF005-17 | \n", "7804901 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Mexico | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851266 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "
4 | \n", "AAASF006-17 | \n", "7804902 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Mexico | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851265 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "
5 | \n", "AAASF007-17 | \n", "7804903 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Mexico | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851254 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "
5 rows × 28 columns
\n", "\n", " | processid | \n", "specimenid | \n", "bin_uri | \n", "identification | \n", "kingdom | \n", "phylum | \n", "class | \n", "order | \n", "family | \n", "subfamily | \n", "... | \n", "province | \n", "region | \n", "sector | \n", "site | \n", "habitat | \n", "gb_acs | \n", "marker_code | \n", "nucraw | \n", "identification_rank | \n", "habitat_type | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "AAASF001-17 | \n", "7804897 | \n", "BOLD:ADP3520 | \n", "Lutzomyia cruciata | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Quintana Roo | \n", "Candelaria | \n", "None | \n", "None | \n", "None | \n", "MK851247 | \n", "COI-5P | \n", "AACATTATATTTTATTTTTGGAGCCTGAGCAGGAATAGTGGGAACA... | \n", "species | \n", "None | \n", "
2 | \n", "AAASF004-17 | \n", "7804900 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851267 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "
3 | \n", "AAASF005-17 | \n", "7804901 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851266 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "
4 | \n", "AAASF006-17 | \n", "7804902 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851265 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "
5 | \n", "AAASF007-17 | \n", "7804903 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "Quintana Roo | \n", "Huayun | \n", "None | \n", "None | \n", "None | \n", "MK851254 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "
5 rows × 29 columns
\n", "\n", " | processid | \n", "specimenid | \n", "bin_uri | \n", "identification | \n", "kingdom | \n", "phylum | \n", "class | \n", "order | \n", "family | \n", "subfamily | \n", "... | \n", "sector | \n", "site | \n", "habitat | \n", "gb_acs | \n", "marker_code | \n", "nucraw | \n", "identification_rank | \n", "habitat_type | \n", "Latitude | \n", "Longitude | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "AAASF001-17 | \n", "7804897 | \n", "BOLD:ADP3520 | \n", "Lutzomyia cruciata | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "None | \n", "None | \n", "None | \n", "MK851247 | \n", "COI-5P | \n", "AACATTATATTTTATTTTTGGAGCCTGAGCAGGAATAGTGGGAACA... | \n", "species | \n", "None | \n", "NaN | \n", "NaN | \n", "
2 | \n", "AAASF004-17 | \n", "7804900 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "None | \n", "None | \n", "None | \n", "MK851267 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "NaN | \n", "NaN | \n", "
3 | \n", "AAASF005-17 | \n", "7804901 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "None | \n", "None | \n", "None | \n", "MK851266 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "NaN | \n", "NaN | \n", "
4 | \n", "AAASF006-17 | \n", "7804902 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "None | \n", "None | \n", "None | \n", "MK851265 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "NaN | \n", "NaN | \n", "
5 | \n", "AAASF007-17 | \n", "7804903 | \n", "BOLD:AAY5017 | \n", "Lutzomyia longipalpis | \n", "Animalia | \n", "Arthropoda | \n", "Insecta | \n", "Diptera | \n", "Psychodidae | \n", "Phlebotominae | \n", "... | \n", "None | \n", "None | \n", "None | \n", "MK851254 | \n", "COI-5P | \n", "GACTTTATATTTTATTTTCGGGGCTTGATCTGGAATAGTAGGGACA... | \n", "species | \n", "None | \n", "NaN | \n", "NaN | \n", "
5 rows × 30 columns
\n", "\n", " | Column | \n", "Original Value | \n", "Encoded Value | \n", "
---|---|---|---|
352523 | \n", "habitat_type | \n", "None | \n", "0 | \n", "
352524 | \n", "habitat_type | \n", "Forest | \n", "1 | \n", "
352525 | \n", "habitat_type | \n", "Grassland | \n", "2 | \n", "
352526 | \n", "habitat_type | \n", "Marine | \n", "3 | \n", "
352527 | \n", "habitat_type | \n", "Mixed Habitat | \n", "4 | \n", "
\n", " | processid | \n", "specimenid | \n", "bin_uri | \n", "identification | \n", "kingdom | \n", "phylum | \n", "class | \n", "order | \n", "family | \n", "subfamily | \n", "... | \n", "sector | \n", "site | \n", "habitat | \n", "gb_acs | \n", "marker_code | \n", "nucraw | \n", "identification_rank | \n", "habitat_type | \n", "Latitude | \n", "Longitude | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "1 | \n", "1 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "2 | \n", "2 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "2 | \n", "0 | \n", "2 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "3 | \n", "3 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "3 | \n", "0 | \n", "3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 | \n", "4 | \n", "4 | \n", "1 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "4 | \n", "0 | \n", "4 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 rows × 30 columns
\n", "