{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# Dataset directory. Set the DEEPLOC_DIR environment variable to run this notebook\n", "# on another machine; the original location is kept as the default.\n", "# Kept as a plain str because later cells build file names with `path + \"/...\"`.\n", "path = os.environ.get(\"DEEPLOC_DIR\", \"/home/a03-sgoel/mESMerize/benchmarks/DeepLoc\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0ACCKingdomPartitionPeripheralTransmembraneLipidAnchorSolubleSequence
00I3R9M8Archaea01000MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG...
11I3R9M9Archaea11000MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR...
22Q7ZAG8Archaea21000MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP...
33Q8PZ67Archaea01001MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY...
44Q9YGA6Archaea01000MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL...
..............................
2802128021P86949Eukaryota00001MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY...
2802228022P86950Eukaryota00001MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG...
2802328023P86951Eukaryota00001MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG...
2802428024P86983Eukaryota30001MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL...
2802528025P86984Eukaryota40001MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL...
\n", "

28026 rows × 9 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 ACC Kingdom Partition Peripheral Transmembrane \\\n", "0 0 I3R9M8 Archaea 0 1 0 \n", "1 1 I3R9M9 Archaea 1 1 0 \n", "2 2 Q7ZAG8 Archaea 2 1 0 \n", "3 3 Q8PZ67 Archaea 0 1 0 \n", "4 4 Q9YGA6 Archaea 0 1 0 \n", "... ... ... ... ... ... ... \n", "28021 28021 P86949 Eukaryota 0 0 0 \n", "28022 28022 P86950 Eukaryota 0 0 0 \n", "28023 28023 P86951 Eukaryota 0 0 0 \n", "28024 28024 P86983 Eukaryota 3 0 0 \n", "28025 28025 P86984 Eukaryota 4 0 0 \n", "\n", " LipidAnchor Soluble Sequence \n", "0 0 0 MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG... \n", "1 0 0 MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR... \n", "2 0 0 MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP... \n", "3 0 1 MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY... \n", "4 0 0 MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL... \n", "... ... ... ... \n", "28021 0 1 MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY... \n", "28022 0 1 MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG... \n", "28023 0 1 MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG... \n", "28024 0 1 MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL... \n", "28025 0 1 MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL... \n", "\n", "[28026 rows x 9 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the raw table in this cell so the preview works on a fresh kernel\n", "# (previously `df` was only defined by a later cell, giving a NameError on Run All).\n", "df = pd.read_csv(path + \"/OG_membrane_type_all.csv\")\n", "df" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ACCKingdomPartitionPeripheralTransmembraneLipidAnchorSolubleSequence
0I3R9M8Archaea01000MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG...
1I3R9M9Archaea11000MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR...
2Q7ZAG8Archaea21000MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP...
3Q8PZ67Archaea01001MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY...
4Q9YGA6Archaea01000MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL...
...........................
28021P86949Eukaryota00001MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY...
28022P86950Eukaryota00001MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG...
28023P86951Eukaryota00001MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG...
28024P86983Eukaryota30001MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL...
28025P86984Eukaryota40001MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL...
\n", "

28026 rows × 8 columns

\n", "
" ], "text/plain": [ " ACC Kingdom Partition Peripheral Transmembrane LipidAnchor \\\n", "0 I3R9M8 Archaea 0 1 0 0 \n", "1 I3R9M9 Archaea 1 1 0 0 \n", "2 Q7ZAG8 Archaea 2 1 0 0 \n", "3 Q8PZ67 Archaea 0 1 0 0 \n", "4 Q9YGA6 Archaea 0 1 0 0 \n", "... ... ... ... ... ... ... \n", "28021 P86949 Eukaryota 0 0 0 0 \n", "28022 P86950 Eukaryota 0 0 0 0 \n", "28023 P86951 Eukaryota 0 0 0 0 \n", "28024 P86983 Eukaryota 3 0 0 0 \n", "28025 P86984 Eukaryota 4 0 0 0 \n", "\n", " Soluble Sequence \n", "0 0 MSTDSDAETVDLADGVDHQVAMVMDLNKCIGCQTCTVACKSLWTEG... \n", "1 0 MSRNDASQLDDGETTAESPPDDQANDAPEVGDPPGDPVDADSGVSR... \n", "2 0 MTKVLVLGGRFGALTAAYTLKRLVGSKADVKVINKSRFSYFRPALP... \n", "3 1 MPPKIAEVIQHDVCAACGACEAVCPIGAVTVKKAAEIRDPNDLSLY... \n", "4 0 MAGVRLVDVWKVFGEVTAVREMSLEVKDGEFMILLGPSGCGKTTTL... \n", "... ... ... \n", "28021 1 MLRFIAIVALIATVNAKGGTYGIGVLPSVTYVSGGGGGYPGIYGTY... \n", "28022 1 MKPFISLASLIVLIASASAGGDDDYGKYGYGSYGPGIGGIGGGGGG... \n", "28023 1 MLKLVCAVVLIATVNAKGSSPGFGIGQLPGITVVSGGVSGGSLSGG... \n", "28024 1 MHQSSLGVLVLFSLIYLCISVHVPFDLNGWKALRLDNNRVQDSTNL... \n", "28025 1 MLMLLCIIATVIPFSLVEGRKGCWADPTPPGKECLYGKEIHGGRNL... 
\n", "\n", "[28026 rows x 8 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the source table and drop the 'Unnamed: 0' column in a single chained step.\n", "df = pd.read_csv(path + \"/OG_membrane_type_all.csv\").drop(columns=['Unnamed: 0'])\n", "df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# Rows whose Partition equals 4 form the test split; every other row is training data.\n", "test = df[df['Partition'].eq(4)]\n", "train = df[df['Partition'].ne(4)]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Write both splits into the dataset directory, omitting the row index.\n", "train.to_csv(path + \"/membrane_type_train.csv\", index=False)\n", "test.to_csv(path + \"/membrane_type_test.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }