diff --git "a/logs/main_log.txt" "b/logs/main_log.txt" --- "a/logs/main_log.txt" +++ "b/logs/main_log.txt" @@ -54723,3 +54723,2435 @@ Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch e [2021-10-22 18:29:04,542] [INFO] [stage2.py:113:__init__] CPU Offload: False [2021-10-22 18:29:04,542] [INFO] [stage2.py:114:__init__] Round robin gradient partitioning: False Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +/gpfswork/rech/six/commun/conda/cutting-edge/lib/python3.8/site-packages/torch/utils/cpp_extension.py:283: UserWarning: + + !! WARNING !! + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +Your compiler (c++) is not compatible with the compiler Pytorch was +built with for this platform, which is g++ on linux. Please +use g++ to to compile your extension. Alternatively, you may +compile PyTorch from source using c++, and then you can also use +c++ to compile your extension. + +See https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md for help +with compiling PyTorch from source. +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + !! WARNING !! + + warnings.warn(WRONG_COMPILER_WARNING.format( +Emitting ninja build file /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +ninja: no work to do. +Loading extension module utils... +Time to load utils op: 1.0921554565429688 seconds +Loading extension module utils...Loading extension module utils... + +Loading extension module utils... +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils... +Loading extension module utils... + + +Time to load utils op: 1.1780741214752197 secondsTime to load utils op: 1.1663405895233154 seconds +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + + +Time to load utils op: 1.1632099151611328 seconds +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils... +Loading extension module utils... + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Loading extension module utils...Loading extension module utils...Loading extension module utils... + + +Loading extension module utils... +Loading extension module utils...Loading extension module utils...Loading extension module utils...Loading extension module utils... + + + +Time to load utils op: 1.1792736053466797 secondsTime to load utils op: 1.182255506515503 secondsTime to load utils op: 1.185680866241455 seconds + + +Time to load utils op: 1.185666561126709 seconds +Time to load utils op: 1.1872801780700684 secondsTime to load utils op: 1.183830976486206 secondsTime to load utils op: 1.1868786811828613 seconds + + +Time to load utils op: 1.1926240921020508 seconds +Time to load utils op: 1.1894752979278564 secondsTime to load utils op: 1.193985939025879 secondsTime to load utils op: 1.1918563842773438 seconds + + +Time to load utils op: 1.1861236095428467 seconds +Time to load utils op: 1.183532953262329 seconds +Time to load utils op: 1.182396650314331 secondsTime to load utils op: 1.1855201721191406 seconds + +Time to load utils op: 1.1853570938110352 seconds +Time to load utils op: 1.1870622634887695 seconds +Time to load utils op: 1.1906986236572266 seconds +Time to load utils op: 1.191178560256958 seconds +Time to load utils op: 1.1916847229003906 seconds +Time to load utils op: 1.1948001384735107 secondsTime to load utils op: 1.1928622722625732 secondsTime to load utils op: 1.1893346309661865 secondsTime to load utils op: 1.1953864097595215 seconds + + + +Time to load utils op: 1.1961758136749268 seconds +Time to load utils op: 1.1909070014953613 seconds +Time to load utils op: 1.1915271282196045 secondsTime to load utils op: 1.1911966800689697 seconds + +Time to load utils op: 1.1896867752075195 seconds +Time to load utils op: 1.186239242553711 seconds +Time to load utils op: 1.1903300285339355 secondsTime to load utils op: 1.190995454788208 seconds + +Time to load utils op: 1.1872761249542236 seconds +Time to load utils op: 1.1909749507904053 secondsTime to load utils op: 1.1920750141143799 secondsTime to load utils op: 1.1917970180511475 seconds + +Time to load utils op: 1.1935746669769287 secondsTime to load utils op: 1.1863646507263184 secondsTime to load utils op: 1.184662103652954 seconds + +Time to load utils op: 1.1859581470489502 seconds +Time to load utils op: 1.1904394626617432 seconds +Time to load utils op: 1.190424919128418 secondsTime to load utils op: 1.1903162002563477 seconds + +Time to load utils op: 1.1942470073699951 seconds + + +Time to load utils op: 1.0839619636535645 secondsTime to load utils op: 1.0974063873291016 seconds +Time to load utils op: 1.097074031829834 seconds + +Time to load utils op: 1.1012611389160156 seconds +Time to load utils op: 1.1827380657196045 secondsTime to load utils op: 1.182737112045288 secondsTime to load utils op: 1.1833257675170898 seconds + + +Time to load utils op: 1.1852617263793945 secondsTime to load utils op: 1.1816432476043701 seconds + +Time to load utils op: 1.181161880493164 seconds +Time to load utils op: 1.184781551361084 secondsTime to load utils op: 1.1848835945129395 seconds + +Time to load utils op: 1.1905791759490967 secondsTime to load utils op: 1.192556381225586 secondsTime to load utils op: 1.1930129528045654 seconds + + +Time to load utils op: 1.1893703937530518 seconds +Time to load utils op: 1.185863971710205 secondsTime to load utils op: 1.1858327388763428 seconds +Time to load utils op: 1.1841635704040527 seconds + +Time to load utils op: 1.1857895851135254 seconds +Time to load utils op: 1.188225269317627 seconds +Time to load utils op: 1.1959540843963623 seconds +Time to load utils op: 1.1878750324249268 seconds +Time to load utils op: 1.1954319477081299 seconds +Time to load utils op: 1.1907052993774414 seconds +Time to load utils op: 1.1855642795562744 secondsTime to load utils op: 1.1845283508300781 seconds + +Time to load utils op: 1.1984584331512451 secondsTime to load utils op: 1.1847038269042969 seconds + +Time to load utils op: 1.1923136711120605 secondsTime to load utils op: 1.1920886039733887 seconds + +Time to load utils op: 1.1991586685180664 seconds +Time to load utils op: 1.1858468055725098 secondsTime to load utils op: 1.1842548847198486 seconds + +Time to load utils op: 1.1868107318878174 seconds +Time to load utils op: 1.1892352104187012 seconds +Time to load utils op: 1.1863431930541992 seconds +Time to load utils op: 1.1912565231323242 secondsTime to load utils op: 1.190610408782959 secondsTime to load utils op: 1.1903424263000488 seconds + + +Time to load utils op: 1.1893854141235352 secondsTime to load utils op: 1.186546802520752 secondsTime to load utils op: 1.1877250671386719 secondsTime to load utils op: 1.1907117366790771 seconds + + + +Time to load utils op: 1.1857829093933105 seconds +Time to load utils op: 1.188356876373291 secondsTime to load utils op: 1.1991848945617676 seconds + +Time to load utils op: 1.0981643199920654 seconds +Time to load utils op: 1.1980187892913818 secondsTime to load utils op: 1.1853604316711426 secondsTime to load utils op: 1.1852247714996338 seconds + + +Time to load utils op: 1.1840436458587646 seconds +Time to load utils op: 1.1878883838653564 seconds +Time to load utils op: 1.0941288471221924 seconds +Time to load utils op: 1.0943520069122314 secondsTime to load utils op: 1.0957441329956055 seconds + +Time to load utils op: 1.1859443187713623 seconds +Time to load utils op: 1.1984052658081055 seconds +Time to load utils op: 1.1855909824371338 seconds +Time to load utils op: 1.1857118606567383 seconds +Time to load utils op: 1.1948862075805664 secondsTime to load utils op: 1.1982452869415283 secondsTime to load utils op: 1.1892218589782715 seconds + + +Time to load utils op: 1.1890010833740234 seconds +Time to load utils op: 1.187697410583496 secondsTime to load utils op: 1.1875190734863281 seconds + +Time to load utils op: 1.1865684986114502 secondsTime to load utils op: 1.1842272281646729 secondsTime to load utils op: 1.187574863433838 seconds + + +Time to load utils op: 1.188019037246704 seconds +Time to load utils op: 1.1860601902008057 seconds +Time to load utils op: 1.1866354942321777 seconds +Time to load utils op: 1.1921167373657227 seconds +Time to load utils op: 1.1922132968902588 seconds +Time to load utils op: 1.192134141921997 secondsTime to load utils op: 1.1908316612243652 seconds + +Time to load utils op: 1.1874518394470215 secondsTime to load utils op: 1.1868841648101807 seconds +Time to load utils op: 1.1852307319641113 seconds + +Time to load utils op: 1.1865730285644531 seconds +Rank: 7 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 5 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 83 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 24 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 40 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 42 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 116 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 120 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 88 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 62 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 26 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 105 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 59 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 16 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 113 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 36 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 123 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 118 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 52 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 71 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 65 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 99 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 60 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 51 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 13 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 87 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 55 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 28 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 72 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 32 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 57 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 66 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 100 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 101 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 44 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 45 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 22 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 8 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 107 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 39 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 33 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 19 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 91 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 75 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 115 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 31 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 21 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 76 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 77 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 9 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 110 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 15 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 111 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 70 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 41 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 37 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 80 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 68 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 56 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 109 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 69 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 73 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 61 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 112 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 17 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 43 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 53 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 104 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 64 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 108 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 89 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 90 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 67 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 35 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 47 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 82 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 11 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 103 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 74 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 25 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 27 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 121 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 30 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 78 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 97 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 81 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 114 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 86 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 79 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 94 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 23 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 20 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 106 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 34 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 117 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 63 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 29 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 102 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 58 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 18 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 14 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 54 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 46 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 93 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 96 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 10 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 38 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 119 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 84 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 92 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 4 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 85 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 95 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 49 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 122 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 127 partition count [1, 1] and sizes[(978112000, False), (203000, False)] +Rank: 98 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 12 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 6 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 125 partition count [1, 1] and sizes[(978112000, False), (203000, False)] +Rank: 2 partition count [1, 1] and sizes[(978112000, False), (179800, False)] +Rank: 124 partition count [1, 1] and sizes[(978112000, False), (203000, False)] +Rank: 126 partition count [1, 1] and sizes[(978112000, False), (203000, False)] +Rank: 1 partition count [1, 1] and sizes[(978112000, False), (179800, False)] +Rank: 3 partition count [1, 1] and sizes[(978112000, False), (179800, False)] +Rank: 48 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Rank: 0 partition count [1, 1] and sizes[(978112000, False), (179800, False)] +Rank: 50 partition count [1, 1] and sizes[(807360000, False), (179800, False)] +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils...Loading extension module utils... + +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0026960372924804688 seconds +Time to load utils op: 0.002483367919921875 secondsTime to load utils op: 0.002634763717651367 seconds + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... + +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils...Loading extension module utils... + +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils... +Loading extension module utils... +Time to load utils op: 0.0012998580932617188 seconds +Time to load utils op: 0.0010313987731933594 seconds +Time to load utils op: 0.0010967254638671875 seconds +Time to load utils op: 0.0010802745819091797 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... + +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0012714862823486328 seconds +Time to load utils op: 0.000995635986328125 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0009961128234863281 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0009837150573730469 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.0013582706451416016 seconds +Time to load utils op: 0.0010190010070800781 secondsTime to load utils op: 0.001050710678100586 seconds + +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0011816024780273438 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0009911060333251953 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0013473033905029297 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.0015382766723632812 seconds +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0012924671173095703 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0014717578887939453 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.001100301742553711 seconds +Time to load utils op: 0.0012242794036865234 seconds +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.001262664794921875 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0011272430419921875 seconds + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0010988712310791016 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0010268688201904297 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0014963150024414062 secondsTime to load utils op: 0.001214742660522461 seconds + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0011036396026611328 secondsTime to load utils op: 0.0010530948638916016 seconds +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils...Loading extension module utils... + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... + + +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0010874271392822266 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... + +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0012218952178955078 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... + +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0010063648223876953 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.001064300537109375 secondsTime to load utils op: 0.0012054443359375 seconds + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0010471343994140625 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0009903907775878906 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils...Loading extension module utils... +Loading extension module utils... + +Time to load utils op: 0.0011501312255859375 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.0010833740234375 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0011906623840332031 seconds +Time to load utils op: 0.001130819320678711 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.001440286636352539 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + + +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0010447502136230469 seconds +Time to load utils op: 0.001249074935913086 secondsTime to load utils op: 0.0010385513305664062 seconds +Time to load utils op: 0.0010800361633300781 seconds + +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Loading extension module utils... +Time to load utils op: 0.001241922378540039 seconds +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +Time to load utils op: 0.0010404586791992188 seconds +Time to load utils op: 0.0014619827270507812 seconds +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.0010890960693359375 seconds +Loading extension module utils... +Loading extension module utils... +Time to load utils op: 0.0014483928680419922 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Loading extension module utils... +Time to load utils op: 0.0013782978057861328 seconds +Time to load utils op: 0.0013997554779052734 seconds +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Time to load utils op: 0.0010879039764404297 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.0012125968933105469 seconds +Loading extension module utils...Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + + +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils... +Loading extension module utils... +Loading extension module utils... +Time to load utils op: 0.001050710678100586 seconds +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils... +Time to load utils op: 0.001033782958984375 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0012385845184326172 seconds +Time to load utils op: 0.0012905597686767578 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Time to load utils op: 0.0011742115020751953 seconds +Time to load utils op: 0.0011260509490966797 seconds +Loading extension module utils... +Time to load utils op: 0.0011167526245117188 seconds +Time to load utils op: 0.0014221668243408203 seconds +Time to load utils op: 0.0011835098266601562 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0010707378387451172 seconds +Time to load utils op: 0.0013043880462646484 seconds +Time to load utils op: 0.0014393329620361328 seconds +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0010654926300048828 seconds +Time to load utils op: 0.0013363361358642578 seconds +Time to load utils op: 0.0010204315185546875 seconds +Time to load utils op: 0.0013217926025390625 secondsTime to load utils op: 0.0010361671447753906 seconds + +Time to load utils op: 0.0012404918670654297 secondsTime to load utils op: 0.0013911724090576172 seconds + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0012700557708740234 seconds +Time to load utils op: 0.0013744831085205078 seconds +Time to load utils op: 0.0014846324920654297 secondsTime to load utils op: 0.0012562274932861328 seconds + +Time to load utils op: 0.0012099742889404297 seconds +Time to load utils op: 0.0015120506286621094 seconds +Time to load utils op: 0.001157999038696289 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.001287221908569336 seconds +Time to load utils op: 0.0013818740844726562 seconds +Time to load utils op: 0.0011267662048339844 secondsTime to load utils op: 0.0011296272277832031 seconds + +Time to load utils op: 0.0013225078582763672 seconds +Time to load utils op: 0.0012676715850830078 seconds +Time to load utils op: 0.0015094280242919922 seconds +Time to load utils op: 0.0013973712921142578 seconds +Time to load utils op: 0.001547098159790039 seconds +Time to load utils op: 0.0011608600616455078 seconds +Time to load utils op: 0.001421213150024414 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0013811588287353516 secondsTime to load utils op: 0.0012068748474121094 seconds + +Time to load utils op: 0.0012369155883789062 seconds +Time to load utils op: 0.0012483596801757812 secondsTime to load utils op: 0.0014431476593017578 seconds + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.0010595321655273438 seconds +Time to load utils op: 0.0010542869567871094 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +Loading extension module utils... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Time to load utils op: 0.002034425735473633 seconds +Time to load utils op: 0.001974344253540039 seconds +Time to load utils op: 0.0019376277923583984 seconds +Time to load utils op: 0.0022072792053222656 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +No modifications detected for re-loaded extension module utils, skipping build step...Loading extension module utils... + +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0013778209686279297 seconds +Time to load utils op: 0.0012845993041992188 seconds +Time to load utils op: 0.0011565685272216797 seconds +[2021-10-22 18:29:07,491] [INFO] [utils.py:806:see_memory_usage] Before initializing optimizer states +Time to load utils op: 0.0012488365173339844 seconds +Time to load utils op: 0.0012938976287841797 secondsTime to load utils op: 0.0010938644409179688 seconds + +Time to load utils op: 0.00113677978515625 seconds +Time to load utils op: 0.0010135173797607422 seconds +Time to load utils op: 0.0024199485778808594 seconds +Time to load utils op: 0.0021665096282958984 seconds +[2021-10-22 18:29:07,492] [INFO] [utils.py:807:see_memory_usage] MA 5.47 GB Max_MA 7.29 GB CA 9.25 GB Max_CA 9 GB +Time to load utils op: 0.003129720687866211 seconds +Time to load utils op: 0.003068208694458008 seconds +[2021-10-22 18:29:07,492] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.22 GB, percent = 21.5% +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils...Loading extension module utils... + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root...Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... + +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0012657642364501953 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0011029243469238281 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0009989738464355469 seconds +Time to load utils op: 0.00098419189453125 seconds +Time to load utils op: 0.0013327598571777344 seconds +Time to load utils op: 0.0013167858123779297 seconds +Time to load utils op: 0.0012409687042236328 secondsTime to load utils op: 0.0012412071228027344 seconds + +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step...No modifications detected for re-loaded extension module utils, skipping build step... + +Loading extension module utils...Loading extension module utils... + +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0013110637664794922 seconds +Time to load utils op: 0.0012788772583007812 seconds +Time to load utils op: 0.0012242794036865234 seconds +Time to load utils op: 0.0011944770812988281 seconds +[2021-10-22 18:29:07,538] [INFO] [utils.py:806:see_memory_usage] After initializing optimizer states +[2021-10-22 18:29:07,539] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 16.41 GB CA 20.19 GB Max_CA 20 GB +[2021-10-22 18:29:07,539] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.22 GB, percent = 21.5% +[2021-10-22 18:29:07,539] [INFO] [stage2.py:474:__init__] optimizer state initialized +[2021-10-22 18:29:07,568] [INFO] [utils.py:806:see_memory_usage] After initializing ZeRO optimizer +[2021-10-22 18:29:07,568] [INFO] [utils.py:807:see_memory_usage] MA 12.76 GB Max_MA 12.76 GB CA 20.19 GB Max_CA 20 GB +[2021-10-22 18:29:07,568] [INFO] [utils.py:815:see_memory_usage] CPU Virtual Memory: used = 40.22 GB, percent = 21.5% +[2021-10-22 18:29:07,569] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[2021-10-22 18:29:07,569] [INFO] [engine.py:599:_configure_lr_scheduler] DeepSpeed using client LR scheduler +[2021-10-22 18:29:07,569] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2021-10-22 18:29:07,569] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0], mom=[(0.9, 0.95), (0.9, 0.95)] +[2021-10-22 18:29:07,569] [INFO] [config.py:940:print] DeepSpeedEngine configuration: +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] allreduce_always_fp32 ........ False +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] amp_enabled .................. False +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] amp_params ................... False +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] checkpoint_tag_validation_enabled True +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] checkpoint_tag_validation_fail False +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] curriculum_enabled ........... True +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] curriculum_params ............ {'curriculum_type': 'seqlen', 'min_difficulty': 64, 'max_difficulty': 2048, 'schedule_type': 'fixed_linear', 'schedule_config': {'total_curriculum_step': 36000, 'difficulty_step': 8}} +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] dataloader_drop_last ......... False +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] disable_allgather ............ False +[2021-10-22 18:29:07,569] [INFO] [config.py:944:print] dump_state ................... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] dynamic_loss_scale_args ...... {'init_scale': 4096, 'scale_window': 500, 'delayed_shift': 2, 'min_scale': 1} +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_enabled ........... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_gas_boundary_resolution 1 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_layer_name ........ bert.encoder.layer +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_layer_num ......... 0 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_max_iter .......... 100 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_stability ......... 1e-06 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_tol ............... 0.01 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] eigenvalue_verbose ........... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] elasticity_enabled ........... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] fp16_enabled ................. True +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] fp16_master_weights_and_gradients False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] fp16_mixed_quantize .......... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] global_rank .................. 0 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] gradient_accumulation_steps .. 2048 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] gradient_clipping ............ 1.0 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] gradient_predivide_factor .... 1.0 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] initial_dynamic_scale ........ 4096 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] loss_scale ................... 0 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] memory_breakdown ............. False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] optimizer_legacy_fusion ...... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] optimizer_name ............... None +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] optimizer_params ............. None +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] pld_enabled .................. False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] pld_params ................... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] prescale_gradients ........... False +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_change_rate ......... 0.001 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_groups .............. 1 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_offset .............. 1000 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_period .............. 1000 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_rounding ............ 0 +[2021-10-22 18:29:07,570] [INFO] [config.py:944:print] quantize_start_bits .......... 16 +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_target_bits ......... 8 +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_training_enabled .... False +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_type ................ 0 +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] quantize_verbose ............. False +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] scheduler_name ............... None +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] scheduler_params ............. None +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] sparse_attention ............. None +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] sparse_gradients_enabled ..... False +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] steps_per_print .............. 2000 +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] tensorboard_enabled .......... False +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] tensorboard_job_name ......... DeepSpeedJobName +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] tensorboard_output_path ...... +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] train_batch_size ............. 2048 +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] train_micro_batch_size_per_gpu 1 +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] use_quantizer_kernel ......... False +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] wall_clock_breakdown ......... False +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] world_size ................... 1 +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_allow_untested_optimizer False +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_config .................. { + "stage": 1, + "contiguous_gradients": true, + "reduce_scatter": true, + "reduce_bucket_size": 5.000000e+08, + "allgather_partitions": true, + "allgather_bucket_size": 5.000000e+08, + "overlap_comm": false, + "load_from_fp32_weights": true, + "elastic_checkpoint": true, + "offload_param": null, + "offload_optimizer": null, + "sub_group_size": 1.000000e+09, + "prefetch_bucket_size": 5.000000e+07, + "param_persistence_threshold": 1.000000e+05, + "max_live_parameters": 1.000000e+09, + "max_reuse_distance": 1.000000e+09, + "gather_fp16_weights_on_model_save": false, + "ignore_unused_parameters": true, + "round_robin_gradients": false, + "legacy_stage1": false +} +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_enabled ................. True +[2021-10-22 18:29:07,571] [INFO] [config.py:944:print] zero_optimization_stage ...... 1 +[2021-10-22 18:29:07,571] [INFO] [config.py:946:print] json = { + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 2.048000e+03, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": 1 + }, + "fp16": { + "enabled": true, + "loss_scale": 0, + "loss_scale_window": 500, + "hysteresis": 2, + "min_loss_scale": 1, + "initial_scale_power": 12 + }, + "curriculum_learning": { + "enabled": true, + "curriculum_type": "seqlen", + "min_difficulty": 64, + "max_difficulty": 2.048000e+03, + "schedule_type": "fixed_linear", + "schedule_config": { + "total_curriculum_step": 3.600000e+04, + "difficulty_step": 8 + } + }, + "steps_per_print": 2.000000e+03, + "wall_clock_breakdown": false +} +Using /gpfsdswork/projects/rech/eha/ura81os/.cache/torch_extensions as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0008153915405273438 seconds +[2021-10-22 18:29:07,572] [INFO] [engine.py:93:__init__] CONFIG: micro_batches=2048 micro_batch_size=1 +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=0 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=3 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=1 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=2 STAGE=0 LAYERS=5 [0, 5) STAGE_PARAMS=978291800 (978.292M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=66 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=96 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=98 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=99 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=32 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=65 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=97 STAGE=24 LAYERS=2 [51, 53) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=34 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=35 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=33 STAGE=8 LAYERS=2 [19, 21) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=67 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=64 STAGE=16 LAYERS=2 [35, 37) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=120 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=107 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=105 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=73 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=75 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=74 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=72 STAGE=18 LAYERS=2 [39, 41) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=8 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=11 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=10 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=17 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=18 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=16 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=19 STAGE=4 LAYERS=2 [11, 13) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=51 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=48 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=49 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=50 STAGE=12 LAYERS=2 [27, 29) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=82 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=81 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=80 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=83 STAGE=20 LAYERS=2 [43, 45) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=70 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=31 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=40 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=5 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=57 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=115 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=114 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=113 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=123 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=104 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=106 STAGE=26 LAYERS=2 [55, 57) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=76 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=77 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=78 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=91 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=88 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=100 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=101 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=25 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=27 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=24 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=26 STAGE=6 LAYERS=2 [15, 17) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=9 STAGE=2 LAYERS=2 [7, 9) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=15 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=92 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=95 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=87 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=85 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=68 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=71 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=69 STAGE=17 LAYERS=2 [37, 39) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=52 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=53 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=28 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=30 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=45 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=39 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=43 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=41 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=42 STAGE=10 LAYERS=2 [23, 25) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=6 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=4 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=59 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=60 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=62 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=112 STAGE=28 LAYERS=2 [59, 61) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=118 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=117 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=110 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=111 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=108 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=109 STAGE=27 LAYERS=2 [57, 59) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=121 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=125 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=127 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=21 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=22 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=79 STAGE=19 LAYERS=2 [41, 43) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=90 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=89 STAGE=22 LAYERS=2 [47, 49) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=103 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=102 STAGE=25 LAYERS=2 [53, 55) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=13 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=14 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=12 STAGE=3 LAYERS=2 [9, 11) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=93 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=94 STAGE=23 LAYERS=2 [49, 51) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=84 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=86 STAGE=21 LAYERS=2 [45, 47) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=54 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=55 STAGE=13 LAYERS=2 [29, 31) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=29 STAGE=7 LAYERS=2 [17, 19) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=44 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=36 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=37 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=38 STAGE=9 LAYERS=2 [21, 23) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=7 STAGE=1 LAYERS=2 [5, 7) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=56 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=63 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=116 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=119 STAGE=29 LAYERS=2 [61, 63) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=122 STAGE=30 LAYERS=2 [63, 65) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=124 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=126 STAGE=31 LAYERS=6 [65, 71) STAGE_PARAMS=978315000 (978.315M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=23 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=20 STAGE=5 LAYERS=2 [13, 15) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=46 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=47 STAGE=11 LAYERS=2 [25, 27) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=58 STAGE=14 LAYERS=2 [31, 33) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) +[2021-10-22 18:29:07,965] [INFO] [engine.py:151:__init__] RANK=61 STAGE=15 LAYERS=2 [33, 35) STAGE_PARAMS=807539800 (807.540M) TOTAL_PARAMS=104731203200 (104731.203M) UNIQUE_PARAMS=104048195200 (104048.195M) + > using checkpoint value 6e-05 for learning rate + > using checkpoint value 6e-06 for minimum learning rate + > using checkpoint value 216320 for warmup iterations + > using checkpoint value 600000000 for total number of iterations + > using checkpoint value cosine for decay style +successfully loaded 1 ZeRO state_dicts for rank 86 +successfully loaded 1 ZeRO state_dicts for rank 85 +successfully loaded 1 ZeRO state_dicts for rank 42 +successfully loaded 1 ZeRO state_dicts for rank 84 +successfully loaded 1 ZeRO state_dicts for rank 91 +successfully loaded 1 ZeRO state_dicts for rank 89 +successfully loaded 1 ZeRO state_dicts for rank 88 +successfully loaded 1 ZeRO state_dicts for rank 113 +successfully loaded 1 ZeRO state_dicts for rank 93 +successfully loaded 1 ZeRO state_dicts for rank 96 +successfully loaded 1 ZeRO state_dicts for rank 41 +successfully loaded 1 ZeRO state_dicts for rank 90 +successfully loaded 1 ZeRO state_dicts for rank 34 +successfully loaded 1 ZeRO state_dicts for rank 95 +successfully loaded 1 ZeRO state_dicts for rank 101 +successfully loaded 1 ZeRO state_dicts for rank 115 +successfully loaded 1 ZeRO state_dicts for rank 99 +successfully loaded 1 ZeRO state_dicts for rank 100 +successfully loaded 1 ZeRO state_dicts for rank 10 +successfully loaded 1 ZeRO state_dicts for rank 112 +successfully loaded 1 ZeRO state_dicts for rank 87 +successfully loaded 1 ZeRO state_dicts for rank 40 +successfully loaded 1 ZeRO state_dicts for rank 32 +successfully loaded 1 ZeRO state_dicts for rank 20 +successfully loaded 1 ZeRO state_dicts for rank 38 +successfully loaded 1 ZeRO state_dicts for rank 4 +successfully loaded 1 ZeRO state_dicts for rank 30 +successfully loaded 1 ZeRO state_dicts for rank 97 +successfully loaded 1 ZeRO state_dicts for rank 98 +successfully loaded 1 ZeRO state_dicts for rank 22 +successfully loaded 1 ZeRO state_dicts for rank 35 +loading 1 zero partition checkpoints for rank 85 +successfully loaded 1 ZeRO state_dicts for rank 16 +successfully loaded 1 ZeRO state_dicts for rank 12 +successfully loaded 1 ZeRO state_dicts for rank 24 +successfully loaded 1 ZeRO state_dicts for rank 50 +loading 1 zero partition checkpoints for rank 86 +successfully loaded 1 ZeRO state_dicts for rank 75 +successfully loaded 1 ZeRO state_dicts for rank 39 +successfully loaded 1 ZeRO state_dicts for rank 77 +successfully loaded 1 ZeRO state_dicts for rank 33 +successfully loaded 1 ZeRO state_dicts for rank 118 +successfully loaded 1 ZeRO state_dicts for rank 17 +successfully loaded 1 ZeRO state_dicts for rank 8 +successfully loaded 1 ZeRO state_dicts for rank 66 +successfully loaded 1 ZeRO state_dicts for rank 107 +successfully loaded 1 ZeRO state_dicts for rank 25 +successfully loaded 1 ZeRO state_dicts for rank 83 +loading 1 zero partition checkpoints for rank 91 +successfully loaded 1 ZeRO state_dicts for rank 15 +successfully loaded 1 ZeRO state_dicts for rank 37 +successfully loaded 1 ZeRO state_dicts for rank 49 +successfully loaded 1 ZeRO state_dicts for rank 11 +successfully loaded 1 ZeRO state_dicts for rank 114 +successfully loaded 1 ZeRO state_dicts for rank 79 +successfully loaded 1 ZeRO state_dicts for rank 117 +successfully loaded 1 ZeRO state_dicts for rank 47 +loading 1 zero partition checkpoints for rank 113 +successfully loaded 1 ZeRO state_dicts for rank 116 +successfully loaded 1 ZeRO state_dicts for rank 45 +successfully loaded 1 ZeRO state_dicts for rank 19 +loading 1 zero partition checkpoints for rank 89 +successfully loaded 1 ZeRO state_dicts for rank 52 +successfully loaded 1 ZeRO state_dicts for rank 28 +successfully loaded 1 ZeRO state_dicts for rank 61 +successfully loaded 1 ZeRO state_dicts for rank 36 +successfully loaded 1 ZeRO state_dicts for rank 64 +successfully loaded 1 ZeRO state_dicts for rank 26 +successfully loaded 1 ZeRO state_dicts for rank 29 +successfully loaded 1 ZeRO state_dicts for rank 67 +successfully loaded 1 ZeRO state_dicts for rank 43 +successfully loaded 1 ZeRO state_dicts for rank 14 +successfully loaded 1 ZeRO state_dicts for rank 63 +successfully loaded 1 ZeRO state_dicts for rank 27 +successfully loaded 1 ZeRO state_dicts for rank 111 +successfully loaded 1 ZeRO state_dicts for rank 23 +successfully loaded 1 ZeRO state_dicts for rank 104 +successfully loaded 1 ZeRO state_dicts for rank 103 +successfully loaded 1 ZeRO state_dicts for rank 56 +successfully loaded 1 ZeRO state_dicts for rank 76 +successfully loaded 1 ZeRO state_dicts for rank 80 +successfully loaded 1 ZeRO state_dicts for rank 70 +successfully loaded 1 ZeRO state_dicts for rank 31 +successfully loaded 1 ZeRO state_dicts for rank 21 +successfully loaded 1 ZeRO state_dicts for rank 69 +successfully loaded 1 ZeRO state_dicts for rank 120 +loading 1 zero partition checkpoints for rank 41 +successfully loaded 1 ZeRO state_dicts for rank 119 +successfully loaded 1 ZeRO state_dicts for rank 46 +loading 1 zero partition checkpoints for rank 115 +successfully loaded 1 ZeRO state_dicts for rank 108 +successfully loaded 1 ZeRO state_dicts for rank 78 +successfully loaded 1 ZeRO state_dicts for rank 44 +successfully loaded 1 ZeRO state_dicts for rank 102 +successfully loaded 1 ZeRO state_dicts for rank 121 +successfully loaded 1 ZeRO state_dicts for rank 123 +successfully loaded 1 ZeRO state_dicts for rank 62 +successfully loaded 1 ZeRO state_dicts for rank 9 +successfully loaded 1 ZeRO state_dicts for rank 73 +successfully loaded 1 ZeRO state_dicts for rank 105 +successfully loaded 1 ZeRO state_dicts for rank 68 +successfully loaded 1 ZeRO state_dicts for rank 54 +successfully loaded 1 ZeRO state_dicts for rank 106 +successfully loaded 1 ZeRO state_dicts for rank 71 +loading 1 zero partition checkpoints for rank 42 +loading 1 zero partition checkpoints for rank 87 +successfully loaded 1 ZeRO state_dicts for rank 122 +successfully loaded 1 ZeRO state_dicts for rank 65 +loading 1 zero partition checkpoints for rank 84 +successfully loaded 1 ZeRO state_dicts for rank 109 +loading 1 zero partition checkpoints for rank 99 +successfully loaded 1 ZeRO state_dicts for rank 59 +successfully loaded 1 ZeRO state_dicts for rank 13 +loading 1 zero partition checkpoints for rank 4 +loading 1 zero partition checkpoints for rank 22 +successfully loaded 1 ZeRO state_dicts for rank 18 +loading 1 zero partition checkpoints for rank 40 +loading 1 zero partition checkpoints for rank 93 +loading 1 zero partition checkpoints for rank 35 +loading 1 zero partition checkpoints for rank 50 +successfully loaded 1 ZeRO state_dicts for rank 110 +loading 1 zero partition checkpoints for rank 88 +loading 1 zero partition checkpoints for rank 75 +loading 1 zero partition checkpoints for rank 96 +loading 1 zero partition checkpoints for rank 90 +loading 1 zero partition checkpoints for rank 77 +loading 1 zero partition checkpoints for rank 10 +successfully loaded 1 ZeRO state_dicts for rank 6 +loading 1 zero partition checkpoints for rank 33 +loading 1 zero partition checkpoints for rank 98 +successfully loaded 1 ZeRO state_dicts for rank 5 +loading 1 zero partition checkpoints for rank 34 +loading 1 zero partition checkpoints for rank 107 +loading 1 zero partition checkpoints for rank 66 +successfully loaded 1 ZeRO state_dicts for rank 60 +successfully loaded 1 ZeRO state_dicts for rank 127 +loading 1 zero partition checkpoints for rank 95 +loading 1 zero partition checkpoints for rank 30 +loading 1 zero partition checkpoints for rank 49 +loading 1 zero partition checkpoints for rank 38 +loading 1 zero partition checkpoints for rank 39 +successfully loaded 1 ZeRO state_dicts for rank 124 +loading 1 zero partition checkpoints for rank 17 +successfully loaded 1 ZeRO state_dicts for rank 55 +loading 1 zero partition checkpoints for rank 79 +loading 1 zero partition checkpoints for rank 101 +successfully loaded 1 ZeRO state_dicts for rank 125 +successfully loaded 1 ZeRO state_dicts for rank 74 +loading 1 zero partition checkpoints for rank 45 +successfully loaded 1 ZeRO state_dicts for rank 72 +loading 1 zero partition checkpoints for rank 15 +loading 1 zero partition checkpoints for rank 8 +successfully loaded 1 ZeRO state_dicts for rank 82 +loading 1 zero partition checkpoints for rank 117 +loading 1 zero partition checkpoints for rank 100 +successfully loaded 1 ZeRO state_dicts for rank 7 +loading 1 zero partition checkpoints for rank 112 +loading 1 zero partition checkpoints for rank 111 +successfully loaded 1 ZeRO state_dicts for rank 92 +loading 1 zero partition checkpoints for rank 32 +successfully loaded 1 ZeRO state_dicts for rank 126 +loading 1 zero partition checkpoints for rank 52 +loading 1 zero partition checkpoints for rank 20 +successfully loaded 1 ZeRO state_dicts for rank 53 +loading 1 zero partition checkpoints for rank 70 +successfully loaded 1 ZeRO state_dicts for rank 94 +loading 1 zero partition checkpoints for rank 21 +successfully loaded 1 ZeRO state_dicts for rank 1 +loading 1 zero partition checkpoints for rank 97 +loading 1 zero partition checkpoints for rank 12 +loading 1 zero partition checkpoints for rank 69 +loading 1 zero partition checkpoints for rank 16 +loading 1 zero partition checkpoints for rank 14 +successfully loaded 1 ZeRO state_dicts for rank 48 +loading 1 zero partition checkpoints for rank 31 +loading 1 zero partition checkpoints for rank 119 +successfully loaded 1 ZeRO state_dicts for rank 81 +loading 1 zero partition checkpoints for rank 24loading 1 zero partition checkpoints for rank 26 + +loading 1 zero partition checkpoints for rank 123 +loading 1 zero partition checkpoints for rank 27 +loading 1 zero partition checkpoints for rank 46 +successfully loaded 1 ZeRO state_dicts for rank 0 +loading 1 zero partition checkpoints for rank 121 +loading 1 zero partition checkpoints for rank 105 +loading 1 zero partition checkpoints for rank 103 +loading 1 zero partition checkpoints for rank 118 +loading 1 zero partition checkpoints for rank 62 +loading 1 zero partition checkpoints for rank 83 +loading 1 zero partition checkpoints for rank 25 +loading 1 zero partition checkpoints for rank 65 +successfully loaded 1 ZeRO state_dicts for rank 57 +loading 1 zero partition checkpoints for rank 37 +loading 1 zero partition checkpoints for rank 73 +loading 1 zero partition checkpoints for rank 54 +loading 1 zero partition checkpoints for rank 114 +loading 1 zero partition checkpoints for rank 102 +loading 1 zero partition checkpoints for rank 11 +successfully loaded 1 ZeRO state_dicts for rank 58 +loading 1 zero partition checkpoints for rank 47 +successfully loaded 1 ZeRO state_dicts for rank 3 +successfully loaded 1 ZeRO state_dicts for rank 51 +loading 1 zero partition checkpoints for rank 116 +loading 1 zero partition checkpoints for rank 19 +loading 1 zero partition checkpoints for rank 36 +loading 1 zero partition checkpoints for rank 28 +loading 1 zero partition checkpoints for rank 64 +loading 1 zero partition checkpoints for rank 18 +loading 1 zero partition checkpoints for rank 61 +loading 1 zero partition checkpoints for rank 43 +loading 1 zero partition checkpoints for rank 29 +loading 1 zero partition checkpoints for rank 67 +successfully loaded 1 ZeRO state_dicts for rank 2 +loading 1 zero partition checkpoints for rank 23 +loading 1 zero partition checkpoints for rank 5 +loading 1 zero partition checkpoints for rank 63 +loading 1 zero partition checkpoints for rank 109 +loading 1 zero partition checkpoints for rank 104 +loading 1 zero partition checkpoints for rank 56 +loading 1 zero partition checkpoints for rank 76 +loading 1 zero partition checkpoints for rank 80 +loading 1 zero partition checkpoints for rank 60 +loading 1 zero partition checkpoints for rank 78 +loading 1 zero partition checkpoints for rank 120 +loading 1 zero partition checkpoints for rank 108 +loading 1 zero partition checkpoints for rank 44 +loading 1 zero partition checkpoints for rank 9 +loading 1 zero partition checkpoints for rank 127 +loading 1 zero partition checkpoints for rank 68 +loading 1 zero partition checkpoints for rank 106 +loading 1 zero partition checkpoints for rank 71 +loading 1 zero partition checkpoints for rank 122 +loading 1 zero partition checkpoints for rank 125 +loading 1 zero partition checkpoints for rank 59 +loading 1 zero partition checkpoints for rank 13 +loading 1 zero partition checkpoints for rank 82 +loading 1 zero partition checkpoints for rank 110 +loading 1 zero partition checkpoints for rank 92 +loading 1 zero partition checkpoints for rank 6 +loading 1 zero partition checkpoints for rank 94 +loading 1 zero partition checkpoints for rank 1 +loading 1 zero partition checkpoints for rank 55 +loading 1 zero partition checkpoints for rank 81 +loading 1 zero partition checkpoints for rank 72 +loading 1 zero partition checkpoints for rank 74 +loading 1 zero partition checkpoints for rank 57 +loading 1 zero partition checkpoints for rank 7 +loading 1 zero partition checkpoints for rank 58 +loading 1 zero partition checkpoints for rank 53 +loading 1 zero partition checkpoints for rank 124 +loading 1 zero partition checkpoints for rank 48 +loading 1 zero partition checkpoints for rank 126 +loading 1 zero partition checkpoints for rank 51 +loading 1 zero partition checkpoints for rank 0 + checkpoint version 3.0 +loading 1 zero partition checkpoints for rank 3 +loading 1 zero partition checkpoints for rank 2 + successfully loaded checkpoint from /gpfsscratch/rech/six/commun/checkpoints/tr8b-104B/checkpoints at iteration 34 +time (ms) | load-checkpoint: 13304.77 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 125.2213504estimated model parameters: 125.2213504 + +estimated model parameters: 103.3650944 +estimated model parameters: 125.22432 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 125.2213504 +estimated model parameters: 125.22432 +estimated model parameters: 125.22432 +estimated model parameters: 125.2213504 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters: 125.22432 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.368064 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.368064estimated model parameters without embeddings: 103.368064 + +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.368064 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters without embeddings: 103.3650944 +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +/gpfsssd/worksf/projects/rech/six/commun/code/tr8b-104B/Megatron-DeepSpeed/megatron/utils.py:274: UserWarning: Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings + warnings.warn("Parameter count with the embeddings will be inaccurate with PP > 1, as the first and last stage hold several copies of the embeddings") +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944estimated model parameters: 103.3650944 + +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944estimated model parameters without embeddings: 103.3650944 + +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +estimated model parameters without embeddings: 103.3650944 +[after model, optimizer, and learning rate scheduler are built] datetime: 2021-10-22 18:29:21 +> building train, validation, and test datasets ... + > datasets target sizes (minimum size): + train: 600000000 + validation: 3000320 + test: 10240 +> building train, validation, and test datasets for GPT ... + > building dataset index ... + reading sizes... + reading pointers... + reading document index... + creating numpy buffer of mmap... + creating memory view of numpy buffer... + > finished creating indexed dataset in 0.125407 seconds + number of documents: 304230423 + > dataset split: + train: + document indices in [0, 288714672) total of 288714672 documents + validation: + document indices in [288714672, 303926193) total of 15211521 documents + test: + document indices in [303926193, 304230423) total of 304230 documents + > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_doc_idx.npy + > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_sample_idx.npy + > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_train_indexmap_600000000ns_2048sl_43s_shuffle_idx.npy + loaded indexed file in 0.349 seconds + total number of samples: 657686117 + total number of epochs: 5 + > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_doc_idx.npy + > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_sample_idx.npy + > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_valid_indexmap_3000320ns_2048sl_43s_shuffle_idx.npy + loaded indexed file in 0.248 seconds + total number of samples: 6927161 + total number of epochs: 1 + > loading doc-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_doc_idx.npy + > loading sample-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_sample_idx.npy + > loading shuffle-idx mapping from /gpfswork/rech/six/commun/datasets-custom/oscar-en/meg-gpt2_text_document_test_indexmap_10240ns_2048sl_43s_shuffle_idx.npy + loaded indexed file in 0.080 seconds + total number of samples: 137384 + total number of epochs: 1 +> finished creating GPT datasets ... +[after dataloaders are built] datetime: 2021-10-22 18:29:27 +done with setup ... +training ... +time (ms) | model-and-optimizer-setup: 19311.12 | train/valid/test-data-iterators-setup: 5548.88 +Number of parameters: 125.2213504 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 125.2213504 billionNumber of parameters: 125.2213504 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + + +Number of parameters: 125.22432 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billionNumber of parameters: 103.3650944 billion + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 125.22432 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + + +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + + +Number of parameters: 125.22432 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billionNumber of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion + +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.368064 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.368064 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.368064 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 125.22432 billion +Number of parameters: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.368064 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters without embeddings: 103.3650944 billion +Number of parameters: 125.2213504 billion +Number of parameters without embeddings: 103.3650944 billion +[before the start of training step] datetime: 2021-10-22 18:29:27 +[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:547:forward] Activation Checkpointing Information +[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:548:forward] ----Partition Activations False, CPU CHECKPOINTING False +[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:551:forward] ----contiguous Memory Checkpointing False with 64 total layers +[2021-10-22 18:29:27,694] [INFO] [checkpointing.py:554:forward] ----Synchronization False +[2021-10-22 18:29:27,695] [INFO] [checkpointing.py:555:forward] ----Profiling time in checkpointing False +[Rank 2] (after 35 iterations) memory (MB) | allocated: 13203.47900390625 | max allocated: 20667.02783203125 | reserved: 24442.0 | max reserved: 24442.0 +[Rank 6] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0 +[Rank 126] (after 35 iterations) memory (MB) | allocated: 13082.6953125 | max allocated: 20546.30126953125 | reserved: 24406.0 | max reserved: 24406.0 +[Rank 10] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 18] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 14] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 26] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 34] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 30] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 22] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 1] (after 35 iterations) memory (MB) | allocated: 13202.11962890625 | max allocated: 20665.66845703125 | reserved: 24442.0 | max reserved: 24442.0 +[Rank 0] (after 35 iterations) memory (MB) | allocated: 13201.28759765625 | max allocated: 20664.83642578125 | reserved: 24442.0 | max reserved: 24442.0 +[Rank 9] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 4] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0[Rank 5] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0 + +[Rank 8] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 124] (after 35 iterations) memory (MB) | allocated: 13082.482421875 | max allocated: 20546.08837890625 | reserved: 24406.0 | max reserved: 24406.0 +[Rank 13] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 125] (after 35 iterations) memory (MB) | allocated: 13082.94921875 | max allocated: 20546.55517578125 | reserved: 24406.0 | max reserved: 24406.0 +[Rank 12] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 17] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 16] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 21] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 20] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 25] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 33] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 29] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 28] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 32] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 42] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 46] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 50] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 24] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 38] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 58] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 62] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 70] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 66] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 86] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 54] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 74] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 82] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 78] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 90] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 94] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 98] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 102] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 106] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 110] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20074.0 | max reserved: 20074.0 +[Rank 114] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 118] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 41] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 40] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 36] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 37] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 122] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 48] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 49] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 45] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 3] (after 35 iterations) memory (MB) | allocated: 13201.53564453125 | max allocated: 20665.08447265625 | reserved: 24442.0 | max reserved: 24442.0 +[Rank 44] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 7] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20086.0 | max reserved: 20086.0 +[Rank 19] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 11] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 15] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 23] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.7158203125 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 27] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20084.0 | max reserved: 20084.0 +[Rank 31] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 35] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 43] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 39] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 47] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20082.0 | max reserved: 20082.0 +[Rank 51] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 55] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 59] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 67] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 75] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 63] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 71] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 83] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 79] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 57] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 61] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 65] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 60] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 53] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 68] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 69] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 56] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 73] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 64] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 95] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 77] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 91] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 52] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20080.0 | max reserved: 20080.0 +[Rank 72] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 81] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 103] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 85] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 99] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 80] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 89] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0[Rank 88] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 + +[Rank 107] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 87] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 92] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 111] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20074.0 | max reserved: 20074.0 +[Rank 97] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 93] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 115] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 96] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 101] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 109] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 104] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 119] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 105] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 108] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20074.0 | max reserved: 20074.0 +[Rank 123] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 117] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 113] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 76] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 121] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 116] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 120] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.37841796875 | reserved: 16994.0 | max reserved: 16994.0 +[Rank 84] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20078.0 | max reserved: 20078.0 +[Rank 100] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.29541015625 | reserved: 20076.0 | max reserved: 20076.0 +[Rank 112] (after 35 iterations) memory (MB) | allocated: 10787.11376953125 | max allocated: 16947.798828125 | reserved: 16994.0 | max reserved: 16994.0 + iteration 35/ 292968 | consumed samples: 71680 | consumed tokens: 4587520 | elapsed time per iteration (ms): 170231.7 | learning rate: 1.988E-05 | global batch size: 2048 | lm loss: 1.020244E+01 | loss scale: 4096.0 | grad norm: 232297.002 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +[Rank 127] (after 35 iterations) memory (MB) | allocated: 13082.57666015625 | max allocated: 20546.1826171875 | reserved: 24406.0 | max reserved: 24406.0 +time (ms) + iteration 36/ 292968 | consumed samples: 73728 | consumed tokens: 4718592 | elapsed time per iteration (ms): 95192.8 | learning rate: 2.045E-05 | global batch size: 2048 | lm loss: 1.179706E+01 | loss scale: 4096.0 | grad norm: 394431.999 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 37/ 292968 | consumed samples: 75776 | consumed tokens: 4849664 | elapsed time per iteration (ms): 94263.9 | learning rate: 2.102E-05 | global batch size: 2048 | lm loss: 1.159876E+01 | loss scale: 4096.0 | grad norm: 309552.600 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 38/ 292968 | consumed samples: 77824 | consumed tokens: 4980736 | elapsed time per iteration (ms): 94613.8 | learning rate: 2.159E-05 | global batch size: 2048 | lm loss: 1.126956E+01 | loss scale: 4096.0 | grad norm: 326011.438 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 39/ 292968 | consumed samples: 79872 | consumed tokens: 5111808 | elapsed time per iteration (ms): 95822.0 | learning rate: 2.215E-05 | global batch size: 2048 | lm loss: 1.047825E+01 | loss scale: 4096.0 | grad norm: 181115.439 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 40/ 292968 | consumed samples: 81920 | consumed tokens: 5242880 | elapsed time per iteration (ms): 96049.2 | learning rate: 2.272E-05 | global batch size: 2048 | lm loss: 1.009597E+01 | loss scale: 4096.0 | grad norm: 105708.713 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 41/ 292968 | consumed samples: 83968 | consumed tokens: 5373952 | elapsed time per iteration (ms): 96857.1 | learning rate: 2.329E-05 | global batch size: 2048 | lm loss: 9.645950E+00 | loss scale: 4096.0 | grad norm: 54189.229 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 42/ 292968 | consumed samples: 86016 | consumed tokens: 5505024 | elapsed time per iteration (ms): 96536.5 | learning rate: 2.386E-05 | global batch size: 2048 | lm loss: 9.366836E+00 | loss scale: 4096.0 | grad norm: 36765.384 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 43/ 292968 | consumed samples: 88064 | consumed tokens: 5636096 | elapsed time per iteration (ms): 97014.4 | learning rate: 2.443E-05 | global batch size: 2048 | lm loss: 9.295312E+00 | loss scale: 4096.0 | grad norm: 101399.317 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 44/ 292968 | consumed samples: 90112 | consumed tokens: 5767168 | elapsed time per iteration (ms): 104666.0 | learning rate: 2.499E-05 | global batch size: 2048 | lm loss: 9.078954E+00 | loss scale: 4096.0 | grad norm: 45212.899 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 45/ 292968 | consumed samples: 92160 | consumed tokens: 5898240 | elapsed time per iteration (ms): 96895.5 | learning rate: 2.556E-05 | global batch size: 2048 | lm loss: 9.004776E+00 | loss scale: 4096.0 | grad norm: 64467.756 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 46/ 292968 | consumed samples: 94208 | consumed tokens: 6029312 | elapsed time per iteration (ms): 95869.1 | learning rate: 2.613E-05 | global batch size: 2048 | lm loss: 8.858628E+00 | loss scale: 4096.0 | grad norm: 34756.107 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 47/ 292968 | consumed samples: 96256 | consumed tokens: 6160384 | elapsed time per iteration (ms): 95837.6 | learning rate: 2.670E-05 | global batch size: 2048 | lm loss: 8.663449E+00 | loss scale: 4096.0 | grad norm: 48155.205 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms) + iteration 48/ 292968 | consumed samples: 98304 | consumed tokens: 6291456 | elapsed time per iteration (ms): 95739.1 | learning rate: 2.727E-05 | global batch size: 2048 | lm loss: 8.545946E+00 | loss scale: 4096.0 | grad norm: 47054.317 | num zeros: 0.0 | curriculum seqlen: 64 | number of skipped iterations: 0 | number of nan iterations: 0 | +time (ms)