chmanoj
/

kenlm_te

Model card Files and versions Community

chmanoj committed on Feb 3, 2022

Commit

c82a77a

1 Parent(s): 2ea8870

Update src files

Browse files

Files changed (4) hide show

.gitattributes +1 -1
.gitignore +1 -0
src/Create_LM.ipynb +44 -44
src/{text.txt → kenlm_text_te.txt} +0 -0

.gitattributes CHANGED Viewed

@@ -1,6 +1,6 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
-text.txt filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bin.* filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text

 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
+*kenlm_text_te.txt filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bin.* filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

@@ -0,0 +1 @@
+.ipynb_checkpoints/

src/Create_LM.ipynb CHANGED Viewed

@@ -2,8 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "5e445ce4-1507-482d-a2a8-03d8802e6311",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -12,14 +12,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "1c1820bc-0125-4589-983f-e454801435a5",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "117e880c8ae8437e9a16ccdf20b659eb",
        "version_major": 2,
        "version_minor": 0
       },
@@ -34,20 +34,20 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Using custom data configuration chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Downloading and preparing dataset samanantar/te (download: 292.93 MiB, generated: 678.62 MiB, post-processed: Unknown size, total: 971.55 MiB) to /home/manoj/.cache/huggingface/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "bd1bfffa9a424a45b3b7324458818f4a",
        "version_major": 2,
        "version_minor": 0
       },
@@ -61,7 +61,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "22a24004a7a546ea88bf7c3fe1c16e46",
        "version_major": 2,
        "version_minor": 0
       },
@@ -75,7 +75,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "9e4a161541734dfbb2de2d3dd46d8753",
        "version_major": 2,
        "version_minor": 0
       },
@@ -89,7 +89,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "992db97134c94b9284b421c7f3ea0b33",
        "version_major": 2,
        "version_minor": 0
       },
@@ -104,7 +104,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Dataset parquet downloaded and prepared to /home/manoj/.cache/huggingface/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-a0473fa2e2573d48/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
      ]
     }
    ],
@@ -115,18 +115,18 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "62fb01f7-24fe-4384-9940-3c262c321a5d",
    "metadata": {},
    "outputs": [],
    "source": [
-    "with open(\"text.txt\", \"w\") as file:\n",
     "  file.write(\" \".join(dataset[\"text\"]))"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4295ab4b-b4d8-4a39-a896-fb86503e4674",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -134,13 +134,13 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "fcc0b573-516a-45d6-af2a-feace521c16d",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'/mnt/c/Projects/Speech/xls-R-finetuning/lm_te'"
       ]
      },
      "execution_count": 5,
@@ -155,8 +155,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "e1f8f887-6201-4ae0-989e-8bdc57816db1",
    "metadata": {},
    "outputs": [
     {
@@ -164,12 +164,12 @@
      "output_type": "stream",
      "text": [
       "=== 1/5 Counting and sorting n-grams ===\n",
-      "Reading /mnt/c/Projects/Speech/xls-R-finetuning/lm_te/text.txt\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
       "Unigram tokens 32852369 types 1308846\n",
       "=== 2/5 Calculating and sorting adjusted counts ===\n",
-      "Chain sizes: 1:15706152 2:2291295744 3:4296179712\n",
       "Statistics:\n",
       "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
       "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
@@ -193,18 +193,18 @@
       "=== 5/5 Writing ARPA model ===\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
-      "Name:lmplz\tVmPeak:6613460 kB\tVmRSS:37976 kB\tRSSMax:1975488 kB\tuser:33.1964\tsys:9.29228\tCPU:42.4891\treal:65.5831\n"
      ]
     }
    ],
    "source": [
-    "!../kenlm/build/bin/lmplz -o 3 <\"text.txt\" > \"3gram.arpa\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "afee7f94-f247-4891-822e-1f4edd5abc81",
    "metadata": {},
    "outputs": [
     {
@@ -212,12 +212,12 @@
      "output_type": "stream",
      "text": [
       "=== 1/5 Counting and sorting n-grams ===\n",
-      "Reading /mnt/c/Projects/Speech/xls-R-finetuning/lm_te/text.txt\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
       "Unigram tokens 32852369 types 1308846\n",
       "=== 2/5 Calculating and sorting adjusted counts ===\n",
-      "Chain sizes: 1:15706152 2:642680448 3:1205025920 4:1928041344 5:2811727104\n",
       "Statistics:\n",
       "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
       "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
@@ -243,40 +243,40 @@
       "=== 5/5 Writing ARPA model ===\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
-      "Name:lmplz\tVmPeak:6620664 kB\tVmRSS:38084 kB\tRSSMax:2239444 kB\tuser:77.3579\tsys:28.8403\tCPU:106.198\treal:159.405\n"
      ]
     }
    ],
    "source": [
-    "!../kenlm/build/bin/lmplz -o 5 <\"text.txt\" > \"5gram.arpa\""
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4d4f8526-fb6a-40cc-bf02-75c78b4138cd",
    "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "33e3c247-1b4b-4e61-a42e-283bef351c4b",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 22.7 s, sys: 6.28 s, total: 28.9 s\n",
-      "Wall time: 1min 29s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
-    "with open(\"3gram.arpa\", \"r\") as read_file, open(\"3gram_correct.arpa\", \"w\") as write_file:\n",
     "  has_added_eos = False\n",
     "  for line in read_file:\n",
     "    if not has_added_eos and \"ngram 1=\" in line:\n",
@@ -292,22 +292,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "id": "0f8ead29-e478-48dd-ace5-46d787d3d68e",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 1min 25s, sys: 27.2 s, total: 1min 52s\n",
-      "Wall time: 5min 28s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
-    "with open(\"5gram.arpa\", \"r\") as read_file, open(\"5gram_correct.arpa\", \"w\") as write_file:\n",
     "  has_added_eos = False\n",
     "  for line in read_file:\n",
     "    if not has_added_eos and \"ngram 1=\" in line:\n",
@@ -324,7 +324,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ad4ea204-d61c-4316-bc30-5bbda696d225",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -332,7 +332,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "152fecfe-9a51-4f6d-9640-c810adb5e456",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -340,7 +340,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -354,7 +354,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.10"
   }
  },
  "nbformat": 4,

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
+   "id": "451d890e",
    "metadata": {},
    "outputs": [],
    "source": [
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
+   "id": "eb0e4037",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "3451cb7648e349cbbbdea3b672207ef7",
        "version_major": 2,
        "version_minor": 0
       },
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "Using custom data configuration chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035\n"
      ]
     },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Downloading and preparing dataset samanantar/te (download: 292.93 MiB, generated: 678.62 MiB, post-processed: Unknown size, total: 971.55 MiB) to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...\n"
      ]
     },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "68ea006ea9b943c3af2ed5ee7bb9fffb",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "b5276db8e4614107ad0bdfe67ccca2fd",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "0d3e27b107e7401dbe7f5dad8aa7ec08",
        "version_major": 2,
        "version_minor": 0
       },
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ead9e8fde9a842b295955332ecae540d",
        "version_major": 2,
        "version_minor": 0
       },
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Dataset parquet downloaded and prepared to /workspace/cache/hf/datasets/parquet/chmanoj--ai4bharat__samanantar_processed_te-ec4e27c180ab4035/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.\n"
      ]
     }
    ],
   {
    "cell_type": "code",
    "execution_count": 4,
+   "id": "e4f4f4e8",
    "metadata": {},
    "outputs": [],
    "source": [
+    "with open(\"kenlm_text_te.txt\", \"w\") as file:\n",
     "  file.write(\" \".join(dataset[\"text\"]))"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "6e8a0e84",
    "metadata": {},
    "outputs": [],
    "source": []
   {
    "cell_type": "code",
    "execution_count": 5,
+   "id": "5dfbf3e1",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
+       "'/workspace/kenlm_te/src'"
       ]
      },
      "execution_count": 5,
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
+   "id": "494bec1a",
    "metadata": {},
    "outputs": [
     {
      "output_type": "stream",
      "text": [
       "=== 1/5 Counting and sorting n-grams ===\n",
+      "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
       "Unigram tokens 32852369 types 1308846\n",
       "=== 2/5 Calculating and sorting adjusted counts ===\n",
+      "Chain sizes: 1:15706152 2:51606089728 3:96761421824\n",
       "Statistics:\n",
       "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
       "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
       "=== 5/5 Writing ARPA model ===\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
+      "Name:lmplz\tVmPeak:145080616 kB\tVmRSS:38292 kB\tRSSMax:33928732 kB\tuser:43.6485\tsys:27.5682\tCPU:71.2168\treal:64.983\n"
      ]
     }
    ],
    "source": [
+    "!../../kenlm/build/bin/lmplz -o 3 <\"kenlm_text_te.txt\" > \"../3gram.arpa\""
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
+   "id": "c2c8c8ce",
    "metadata": {},
    "outputs": [
     {
      "output_type": "stream",
      "text": [
       "=== 1/5 Counting and sorting n-grams ===\n",
+      "Reading /workspace/kenlm_te/src/kenlm_text_te.txt\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
       "Unigram tokens 32852369 types 1308846\n",
       "=== 2/5 Calculating and sorting adjusted counts ===\n",
+      "Chain sizes: 1:15706152 2:14474877952 3:27140399104 4:43424632832 5:63327596544\n",
       "Statistics:\n",
       "1 1308845 D1=0.726852 D2=1.02775 D3+=1.30996\n",
       "2 12720239 D1=0.818931 D2=1.12897 D3+=1.32699\n",
       "=== 5/5 Writing ARPA model ===\n",
       "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
       "****************************************************************************************************\n",
+      "Name:lmplz\tVmPeak:145104204 kB\tVmRSS:38296 kB\tRSSMax:26419104 kB\tuser:89.0779\tsys:42.0565\tCPU:131.134\treal:97.4678\n"
      ]
     }
    ],
    "source": [
+    "!../../kenlm/build/bin/lmplz -o 5 <\"kenlm_text_te.txt\" > \"../5gram.arpa\""
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "62b727b7",
    "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
+   "id": "c27f1ef3",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "CPU times: user 19.1 s, sys: 3.81 s, total: 22.9 s\n",
+      "Wall time: 22.9 s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
+    "with open(\"../3gram.arpa\", \"r\") as read_file, open(\"../3gram_correct.arpa\", \"w\") as write_file:\n",
     "  has_added_eos = False\n",
     "  for line in read_file:\n",
     "    if not has_added_eos and \"ngram 1=\" in line:\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 11,
+   "id": "8c8d963b",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "CPU times: user 1min 5s, sys: 12.8 s, total: 1min 18s\n",
+      "Wall time: 1min 18s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
+    "with open(\"../5gram.arpa\", \"r\") as read_file, open(\"../5gram_correct.arpa\", \"w\") as write_file:\n",
     "  has_added_eos = False\n",
     "  for line in read_file:\n",
     "    if not has_added_eos and \"ngram 1=\" in line:\n",
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "9447691c",
    "metadata": {},
    "outputs": [],
    "source": []
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "95d50071",
    "metadata": {},
    "outputs": [],
    "source": []
  ],
  "metadata": {
   "kernelspec": {
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.8.8"
   }
  },
  "nbformat": 4,

src/{text.txt → kenlm_text_te.txt} RENAMED Viewed

File without changes