Spaces:
Sleeping
Sleeping
File size: 3,287 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
# Natural Language Toolkit: Word List Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
# Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
import re
from collections import defaultdict, namedtuple
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.corpus.reader.wordlist import WordListCorpusReader
from nltk.tokenize import line_tokenize
# One row of PanLex language metadata. Fields, in positional order:
#   (1) panlex_uid  -- PanLex UID
#   (2) iso639      -- ISO 639 language code
#   (3) iso639_type -- ISO 639 language type, see README
#   (4) script      -- normal scripts of expressions
#   (5) name        -- PanLex default name
#   (6) langvar_uid -- UID of the language variety in which the default
#                      name is an expression
PanlexLanguage = namedtuple(
    "PanlexLanguage",
    (
        "panlex_uid",
        "iso639",
        "iso639_type",
        "script",
        "name",
        "langvar_uid",
    ),
)
class PanlexSwadeshCorpusReader(WordListCorpusReader):
    """
    This is a class to read the PanLex Swadesh list from

    David Kamholz, Jonathan Pool, and Susan M. Colowick (2014).
    PanLex: Building a Resource for Panlingual Lexical Translation.
    In LREC. http://www.lrec-conf.org/proceedings/lrec2014/pdf/1029_Paper.pdf

    License: CC0 1.0 Universal
    https://creativecommons.org/publicdomain/zero/1.0/legalcode
    """

    def __init__(self, *args, **kwargs):
        """
        Initialise the reader and index the language metadata.

        All arguments are forwarded unchanged to the parent constructor.

        :raise ValueError: if no fileid has the form ``swadesh<size>/...``.
        """
        super().__init__(*args, **kwargs)
        # Order matters: ``get_languages`` needs ``swadesh_size`` and
        # ``get_macrolanguages`` needs ``_languages``.
        self.swadesh_size = self._find_swadesh_size(self.fileids())
        self._languages = {lang.panlex_uid: lang for lang in self.get_languages()}
        # NOTE: the attribute name is misspelled; it is kept as-is so that
        # existing code referring to it keeps working.
        self._macro_langauges = self.get_macrolanguages()

    @staticmethod
    def _find_swadesh_size(fileids):
        """
        Return the ``<size>`` part of the first fileid shaped like
        ``swadesh<size>/...``.

        :param fileids: an iterable of fileid strings.
        :return: the captured size as a string (e.g. ``"110"``).
        :raise ValueError: if no fileid matches.
        """
        for fileid in fileids:
            # ``[^/]*`` stops at the first path separator, so a nested path
            # cannot leak directory names into the captured size.
            match = re.match(r"swadesh([0-9][^/]*)/", fileid)
            if match:
                return match.group(1)
        raise ValueError(
            "Cannot determine the Swadesh list size: no fileid of the form "
            "'swadesh<size>/...' was found."
        )

    def license(self):
        """
        :return: the name of the license of this corpus.
        """
        return "CC0 1.0 Universal"

    def language_codes(self):
        """
        :return: a view of the PanLex UIDs of all known languages.
        """
        return self._languages.keys()

    def get_languages(self):
        """
        Parse ``langs<swadesh_size>.txt`` into language records.

        Each non-blank line is split on tabs and must provide exactly the
        six ``PanlexLanguage`` fields.

        :return: a generator of ``PanlexLanguage`` tuples.
        """
        for line in self.raw(f"langs{self.swadesh_size}.txt").split("\n"):
            line = line.strip()  # Also drops a stray "\r" from CRLF files.
            if not line:  # Skip empty lines.
                continue
            yield PanlexLanguage(*line.split("\t"))

    def get_macrolanguages(self):
        """
        Group the PanLex UIDs of the known languages by ISO 639 code.

        :return: a ``defaultdict`` mapping each ISO 639 code to the list of
            PanLex UIDs that carry it.
        """
        macro_languages = defaultdict(list)
        for lang in self._languages.values():
            macro_languages[lang.iso639].append(lang.panlex_uid)
        return macro_languages

    def words_by_lang(self, lang_code):
        """
        Read ``swadesh<swadesh_size>/<lang_code>.txt``.

        :param lang_code: the code that names the word list file
            (presumably a PanLex UID, see ``language_codes()``).
        :return: a list of list(str), one inner list per entry returned by
            ``self.words``, obtained by splitting that entry on tabs.
        """
        fileid = f"swadesh{self.swadesh_size}/{lang_code}.txt"
        return [concept.split("\t") for concept in self.words(fileid)]

    def words_by_iso639(self, iso63_code):
        """
        Read the word lists of every language sharing an ISO 639 code.

        :param iso63_code: the ISO 639 code to look up.
        :return: a list of list(str): the results of ``words_by_lang`` for
            each matching language, concatenated. Empty if the code is
            unknown.
        """
        # ``.get`` rather than indexing: indexing the defaultdict would
        # insert an empty entry for every unknown code that is queried.
        lang_codes = self._macro_langauges.get(iso63_code, ())
        return [
            concept
            for lang_code in lang_codes
            for concept in self.words_by_lang(lang_code)
        ]

    def entries(self, fileids=None):
        """
        Align the word lists of several files position by position.

        :param fileids: a fileid or a list of fileids; all of the reader's
            fileids are used when omitted or empty.
        :return: a list of tuples; the i-th tuple holds the i-th word of
            each file, in the order of ``fileids``. Alignment stops at the
            end of the shortest word list.
        """
        if not fileids:
            fileids = self.fileids()
        elif isinstance(fileids, str):
            # A bare string would otherwise be iterated character by character.
            fileids = [fileids]
        wordlists = [self.words(f) for f in fileids]
        return list(zip(*wordlists))
|