kenlm-sp-jomleh / model.py
mehran's picture
Update model.py
576f1fa
raw
history blame
14.6 kB
import os
import kenlm
import sentencepiece as spm
from tokenizers import normalizers, Regex
# Borrowed from Jomleh dataset code
# Character-level normalization table: each key is a single character that is
# replaced by the mapped string (a base Farsi letter, a Farsi digit or
# punctuation mark, a space, a short Farsi word, or the empty string to drop
# the character altogether).
#
# NOTE: code points above U+FFFF (the Arabic Mathematical Alphabetic Symbols,
# U+1EE00-U+1EEFF) have to be written with the 8-digit "\UXXXXXXXX" escape.
# A 4-digit "\uXXXX" escape followed by one extra hex digit is parsed as a
# two-character string and would never match the intended symbol.
char_map = {
    # Arabic Letter Hamza
    # "\u": "\u0621",
    # Arabic Letter Alef with Hamza Above
    "\uFE83": "\u0623",
    "\uFE84": "\u0623",
    # Arabic Letter Yeh with Hamza Above
    "\uFE89": "\u0626",
    "\uFE8A": "\u0626",
    "\uFE8B": "\u0626",
    "\uFE8C": "\u0626",
    # Arabic Letter Waw with Hamza Above
    "\uFE85": "\u0624",
    "\uFE86": "\u0624",
    "\u0676": "\u0624",
    # Arabic Letter Alef with Madda Above
    "\uFE81": "\u0622",  # Arabic Letter Alef with Madda Above Isolated Form
    "\uFE82": "\u0622",  # Arabic Letter Alef with Madda Above Final Form
    # Alef
    "\uFB50": "\u0627",  # Arabic letter Alef wasla
    "\uFE87": "\u0627",
    "\u0675": "\u0627",
    "\u0625": "\u0627",
    "\uFE8D": "\u0627",
    "\uFE8E": "\u0627",
    "\U0001EE00": "\u0627",
    "\U0001EE80": "\u0627",
    # Beh
    "\uFE8F": "\u0628",
    "\uFE90": "\u0628",
    "\uFE91": "\u0628",
    "\uFE92": "\u0628",
    "\U0001EE01": "\u0628",
    "\U0001EE21": "\u0628",
    "\U0001EE61": "\u0628",
    "\U0001EE81": "\u0628",
    "\U0001EEA1": "\u0628",
    # Pe
    "\uFB56": "\u067E",
    "\uFB57": "\u067E",
    "\uFB58": "\u067E",
    "\uFB59": "\u067E",
    # Teh
    "\uFE95": "\u062A",
    "\uFE96": "\u062A",
    "\uFE97": "\u062A",
    "\uFE98": "\u062A",
    "\U0001EE15": "\u062A",
    "\U0001EE35": "\u062A",
    "\U0001EE75": "\u062A",
    "\U0001EE95": "\u062A",
    "\U0001EEB5": "\u062A",
    # Theh
    "\uFE99": "\u062B",
    "\uFE9A": "\u062B",
    "\uFE9B": "\u062B",
    "\uFE9C": "\u062B",
    "\U0001EE16": "\u062B",
    "\U0001EE36": "\u062B",
    "\U0001EE76": "\u062B",
    "\U0001EE96": "\u062B",
    "\U0001EEB6": "\u062B",
    # Jim
    "\uFE9D": "\u062C",
    "\uFE9E": "\u062C",
    "\uFE9F": "\u062C",
    "\uFEA0": "\u062C",
    "\U0001EE02": "\u062C",
    "\U0001EE22": "\u062C",
    "\U0001EE42": "\u062C",
    "\U0001EE62": "\u062C",
    "\U0001EE82": "\u062C",
    "\U0001EEA2": "\u062C",
    # Cheh
    "\uFB7A": "\u0686",
    "\uFB7B": "\u0686",
    "\uFB7C": "\u0686",
    "\uFB7D": "\u0686",
    # Hah
    "\uFEA1": "\u062D",
    "\uFEA2": "\u062D",
    "\uFEA3": "\u062D",
    "\uFEA4": "\u062D",
    "\U0001EE07": "\u062D",
    "\U0001EE27": "\u062D",
    "\U0001EE47": "\u062D",
    "\U0001EE67": "\u062D",
    "\U0001EE87": "\u062D",
    "\U0001EEA7": "\u062D",
    # Khah
    "\uFEA5": "\u062E",
    "\uFEA6": "\u062E",
    "\uFEA7": "\u062E",
    "\uFEA8": "\u062E",
    "\U0001EE17": "\u062E",
    "\U0001EE37": "\u062E",
    "\U0001EE57": "\u062E",
    "\U0001EE77": "\u062E",
    "\U0001EE97": "\u062E",
    "\U0001EEB7": "\u062E",
    # Dal
    "\uFEA9": "\u062F",
    "\uFEAA": "\u062F",
    "\U0001EE03": "\u062F",
    "\U0001EE83": "\u062F",
    "\U0001EEA3": "\u062F",
    # Zal
    "\uFEAB": "\u0630",
    "\uFEAC": "\u0630",
    "\U0001EE18": "\u0630",
    "\U0001EE98": "\u0630",
    "\U0001EEB8": "\u0630",
    # Reh
    "\uFEAE": "\u0631",  # Arabic Letter Reh Final Form
    "\uFEAD": "\u0631",  # Arabic Letter Reh Isolated Form
    "\u0692": "\u0631",
    "\U0001EE13": "\u0631",
    "\U0001EE93": "\u0631",
    "\U0001EEB3": "\u0631",
    # Ze
    "\uFEAF": "\u0632",
    "\uFEB0": "\u0632",
    "\U0001EE06": "\u0632",
    "\U0001EE86": "\u0632",
    "\U0001EEA6": "\u0632",
    # Jhe
    "\uFB8A": "\u0698",
    "\uFB8B": "\u0698",
    # Seen
    "\uFEB1": "\u0633",
    "\uFEB2": "\u0633",
    "\uFEB3": "\u0633",
    "\uFEB4": "\u0633",
    "\U0001EE0E": "\u0633",
    "\U0001EE2E": "\u0633",
    "\U0001EE4E": "\u0633",
    "\U0001EE6E": "\u0633",
    "\U0001EE8E": "\u0633",
    "\U0001EEAE": "\u0633",
    # Sheen
    "\uFEB5": "\u0634",
    "\uFEB6": "\u0634",
    "\uFEB7": "\u0634",
    "\uFEB8": "\u0634",
    "\U0001EE14": "\u0634",
    "\U0001EE34": "\u0634",
    "\U0001EE54": "\u0634",
    "\U0001EE74": "\u0634",
    "\U0001EE94": "\u0634",
    "\U0001EEB4": "\u0634",
    # Sad
    "\uFEB9": "\u0635",
    "\uFEBA": "\u0635",
    "\uFEBB": "\u0635",
    "\uFEBC": "\u0635",
    "\U0001EE11": "\u0635",
    "\U0001EE31": "\u0635",
    "\U0001EE51": "\u0635",
    "\U0001EE71": "\u0635",
    "\U0001EE91": "\u0635",
    "\U0001EEB1": "\u0635",
    # Zad
    "\uFEBD": "\u0636",
    "\uFEBE": "\u0636",
    "\uFEBF": "\u0636",
    "\uFEC0": "\u0636",
    "\U0001EE19": "\u0636",
    "\U0001EE39": "\u0636",
    "\U0001EE59": "\u0636",
    "\U0001EE79": "\u0636",
    "\U0001EE99": "\u0636",
    "\U0001EEB9": "\u0636",
    # Ta
    "\uFEC1": "\u0637",
    "\uFEC2": "\u0637",
    "\uFEC3": "\u0637",
    "\uFEC4": "\u0637",
    "\U0001EE08": "\u0637",
    "\U0001EE68": "\u0637",
    "\U0001EE88": "\u0637",
    "\U0001EEA8": "\u0637",
    # Za
    "\uFEC5": "\u0638",
    "\uFEC6": "\u0638",
    "\uFEC7": "\u0638",
    "\uFEC8": "\u0638",
    "\U0001EE1A": "\u0638",
    "\U0001EE7A": "\u0638",
    "\U0001EE9A": "\u0638",
    "\U0001EEBA": "\u0638",
    # Ain
    "\uFEC9": "\u0639",
    "\uFECA": "\u0639",
    "\uFECB": "\u0639",
    "\uFECC": "\u0639",
    "\U0001EE0F": "\u0639",
    "\U0001EE2F": "\u0639",
    "\U0001EE4F": "\u0639",
    "\U0001EE6F": "\u0639",
    "\U0001EE8F": "\u0639",
    "\U0001EEAF": "\u0639",
    # Ghain
    "\uFECD": "\u063A",
    "\uFECE": "\u063A",
    "\uFECF": "\u063A",
    "\uFED0": "\u063A",
    "\U0001EE1B": "\u063A",
    "\U0001EE3B": "\u063A",
    "\U0001EE5B": "\u063A",
    "\U0001EE7B": "\u063A",
    "\U0001EE9B": "\u063A",
    "\U0001EEBB": "\u063A",
    # Fa
    "\uFED1": "\u0641",
    "\uFED2": "\u0641",
    "\uFED3": "\u0641",
    "\uFED4": "\u0641",
    "\U0001EE10": "\u0641",
    "\U0001EE30": "\u0641",
    "\U0001EE70": "\u0641",
    "\U0001EE90": "\u0641",
    "\U0001EEB0": "\u0641",
    # Qaf
    "\uFED5": "\u0642",
    "\uFED6": "\u0642",
    "\uFED7": "\u0642",
    "\uFED8": "\u0642",
    "\U0001EE12": "\u0642",
    "\U0001EE32": "\u0642",
    "\U0001EE52": "\u0642",
    "\U0001EE72": "\u0642",
    "\U0001EE92": "\u0642",
    "\U0001EEB2": "\u0642",
    # Kaf
    "\uFB8E": "\u06A9",  # Arabic Letter Keheh Isolated Form
    "\uFB8F": "\u06A9",  # Arabic Letter Keheh Final Form
    "\uFB90": "\u06A9",  # Arabic Letter Keheh Initial Form
    "\uFB91": "\u06A9",  # Arabic Letter Keheh Medial Form
    "\uFCC8": "\u06A9",  # Arabic Ligature Kaf with Meem Initial Form
    "\u0643": "\u06A9",
    "\uFED9": "\u06A9",
    "\uFEDA": "\u06A9",  # Arabic Letter Kaf Final Form
    "\uFEDB": "\u06A9",
    "\uFEDC": "\u06A9",
    "\U0001EE0A": "\u06A9",
    "\U0001EE2A": "\u06A9",
    "\U0001EE6A": "\u06A9",
    # Gaf
    "\uFB92": "\u06AF",  # Arabic letter Gaf isolated form
    "\uFB93": "\u06AF",  # Arabic letter Gaf final form
    "\uFB94": "\u06AF",  # Arabic letter Gaf initial form
    "\uFB95": "\u06AF",  # Arabic letter Gaf medial form
    # Lam
    "\uFCC9": "\u0644",  # Arabic Ligature Lam with Jeem Initial Form
    "\uFEDD": "\u0644",  # Arabic Letter Lam Isolated Form
    "\uFEDE": "\u0644",  # Arabic Letter Lam Final Form
    "\uFEDF": "\u0644",  # Arabic Letter Lam Initial Form
    "\uFEE0": "\u0644",  # Arabic Letter Lam Medial Form
    "\U0001EE0B": "\u0644",  # Arabic Mathematical Lam
    "\U0001EE2B": "\u0644",  # Arabic Mathematical Initial Lam
    "\U0001EE4B": "\u0644",  # Arabic Mathematical Tailed Lam
    "\U0001EE8B": "\u0644",  # Arabic Mathematical Looped Lam
    "\U0001EEAB": "\u0644",  # Arabic Mathematical Double-Struck Lam
    # Mim
    "\uFEE1": "\u0645",  # Arabic Letter Meem Isolated Form
    "\uFEE2": "\u0645",  # Arabic Letter Meem Final Form
    "\uFEE3": "\u0645",  # Arabic Letter Meem Initial Form
    "\uFEE4": "\u0645",  # Arabic Letter Meem Medial Form
    "\U0001EE0C": "\u0645",  # Arabic Mathematical Meem
    "\U0001EE2C": "\u0645",  # Arabic Mathematical Initial Meem
    "\U0001EE6C": "\u0645",  # Arabic Mathematical Stretched Meem
    "\U0001EE8C": "\u0645",  # Arabic Mathematical Looped Meem
    "\U0001EEAC": "\u0645",  # Arabic Mathematical Double-Struck Meem
    # Nun
    "\uFEE5": "\u0646",  # Arabic Letter Noon Isolated Form
    "\uFEE6": "\u0646",  # Arabic Letter Noon Final Form
    "\uFEE7": "\u0646",  # Arabic Letter Noon Initial Form
    "\uFEE8": "\u0646",  # Arabic Letter Noon Medial Form
    "\U0001EE0D": "\u0646",  # Arabic Mathematical Noon
    "\U0001EE2D": "\u0646",  # Arabic Mathematical Initial Noon
    "\U0001EE4D": "\u0646",  # Arabic Mathematical Tailed Noon
    "\U0001EE6D": "\u0646",  # Arabic Mathematical Stretched Noon
    "\U0001EE8D": "\u0646",  # Arabic Mathematical Looped Noon
    "\U0001EEAD": "\u0646",  # Arabic Mathematical Double-Struck Noon
    # Vav
    "\u0677": "\u0648",  # Arabic Letter U with Hamza Above
    "\uFEED": "\u0648",  # Arabic Letter Waw Isolated Form
    "\uFEEE": "\u0648",  # Arabic Letter Waw Final Form
    "\u06C6": "\u0648",  # Arabic Letter Oe
    "\u06C7": "\u0648",  # Arabic Letter U
    # He
    "\u06C0": "\u0647",  # Arabic letter Heh with yeh above
    "\u0629": "\u0647",  # Arabic Letter Teh Marbuta
    "\u06BE": "\u0647",  # Arabic Letter Heh Doachashmee
    "\uFE93": "\u0647",  # Arabic Letter Teh Marbuta Isolated Form
    "\u06D5": "\u0647",  # Arabic Letter Ae
    "\uFEE9": "\u0647",  # Arabic Letter Heh Isolated Form
    "\uFEEA": "\u0647",  # Arabic Letter Heh Final Form
    "\uFEEB": "\u0647",  # Arabic Letter Heh Initial Form
    "\uFEEC": "\u0647",  # Arabic Letter Heh Medial Form
    "\U0001EE24": "\u0647",  # Arabic Mathematical Initial Heh
    "\U0001EE64": "\u0647",  # Arabic Mathematical Stretched Heh
    "\U0001EE84": "\u0647",  # Arabic Mathematical Looped Heh
    # Yeh
    "\u06D0": "\u06CC",  # Arabic Letter E
    "\uFEEF": "\u06CC",  # Arabic Letter Alef Maksura Isolated Form
    "\uFEF3": "\u06CC",  # Arabic Letter Yeh Initial Form
    "\uFEF4": "\u06CC",  # Arabic Letter Yeh Medial Form
    "\u064A": "\u06CC",  # Arabic Letter Yeh
    "\uFEF1": "\u06CC",  # Arabic Letter Yeh Isolated Form
    "\u06CE": "\u06CC",  # Arabic Letter Yeh with Small V
    "\uFBFD": "\u06CC",  # Arabic Letter Farsi Yeh Final Form
    "\uFBFC": "\u06CC",  # Arabic Letter Farsi Yeh Isolated Form
    "\uFBFE": "\u06CC",  # Arabic Letter Farsi Yeh Initial Form
    "\uFBFF": "\u06CC",  # Arabic Letter Farsi Yeh Medial Form
    "\uFEF0": "\u06CC",  # Arabic Letter Alef Maksura Final Form
    "\uFEF2": "\u06CC",  # Arabic Letter Yeh Final Form
    "\u063D": "\u06CC",
    "\u063E": "\u06CC",
    "\u063F": "\u06CC",
    "\u06D2": "\u06CC",  # Arabic Letter Yeh Barree
    # Combining marks (U+064B-U+0652, U+0654) are dropped
    "\u064E": "",
    "\u064B": "",
    "\u064F": "",
    "\u064C": "",
    "\u0650": "",
    "\u064D": "",
    "\u0652": "",
    "\u0651": "",
    "\u0654": "",
    # ASCII digits -> Farsi digits
    "0": "۰",
    "1": "۱",
    "2": "۲",
    "3": "۳",
    "4": "۴",
    "5": "۵",
    "6": "۶",
    "7": "۷",
    "8": "۸",
    "9": "۹",
    # Arabic-Indic digits -> Farsi digits
    "٠": "۰",
    "١": "۱",
    "٢": "۲",
    "٣": "۳",
    "٤": "۴",
    "٥": "۵",
    "٦": "۶",
    "٧": "۷",
    "٨": "۸",
    "٩": "۹",
    # Punctuation and symbols
    "٬": "،",
    ",": "،",
    ";": "؛",
    "?": "؟",
    "\\": " ",
    "…": " غیره ",
    "%": " درصد ",
    "\u200e": " ",  # LEFT-TO-RIGHT
    "\u200f": " ",  # RIGHT-TO-LEFT
    "\u202a": " ",  # LEFT-TO-RIGHT EMBEDDING
    "\u202b": " ",  # RIGHT-TO-LEFT EMBEDDING
    "\u2066": " ",  # LEFT-TO-RIGHT ISOLATE
    "\u2067": " ",  # RIGHT-TO-LEFT ISOLATE
    "\u2069": " ",  # POP DIRECTIONAL ISOLATE
    "\ufdef": " ",  # Non-standard
    "\u00B7": ".",  # MIDDLE DOT
    "\u2022": " ",  # BULLET POINT
    "'": " ",
    "“": " ",
    "”": " ",
    "\u00ad": " ",
    "\u005f": " ",
    "\u002b": " ",
    "\u200b": " ",
    # ©
    "\u00a9": " ",
    "\u2014": " ",  # Em Dash
    "\u2019": " ",  # Right Single Quotation Mark
    "\uFE0F": "",  # Variation Selector-16 (VS16)
    "\u007C": " ",  # Vertical Line
}
class KenlmModel:
    """KenLM language model applied to SentencePiece pieces of normalized text.

    Incoming documents are passed through a configurable chain of
    ``tokenizers`` normalizers, split into pieces by a SentencePiece model,
    re-joined with single spaces and then scored by the KenLM model.
    """

    def __init__(
        self,
        vocabulary_size: str,
        ngram: str,
        pruning: str,
        map_to_farsi_alphabet: bool = True,
        normalize_nfd: bool = True,
        normalize_numbers: bool = True,
        remove_puctuation: bool = True,
        remove_non_farsi: bool = True,
    ):
        """Load the model files and assemble the normalization pipeline.

        Both files are looked up in a ``files`` directory relative to the
        current working directory; their names are derived from the first
        three arguments.

        Args:
            vocabulary_size: Vocabulary-size tag used in both file names.
            ngram: N-gram order tag used in the KenLM file name.
            pruning: Pruning tag used in the KenLM file name.
            map_to_farsi_alphabet: Apply every replacement in ``char_map``.
            normalize_nfd: Apply Unicode NFD normalization.
            normalize_numbers: Replace every Farsi digit ۱-۹ with ۰.
            remove_puctuation: Delete ``.``, ``!`` and the Farsi semicolon,
                comma and question mark.
            remove_non_farsi: Delete every character outside the whitelist
                below, then strip surrounding whitespace.
        """
        lm_file = f"jomleh-sp-{vocabulary_size}-o{ngram}-prune{pruning}.probing"
        self.model = kenlm.Model(os.path.join("files", lm_file))
        sp_file = f"jomleh-sp-{vocabulary_size}.model"
        self.tokenizer = spm.SentencePieceProcessor(os.path.join("files", sp_file))

        # The steps run in the order they are appended.
        steps = []
        if map_to_farsi_alphabet:
            for source, target in char_map.items():
                steps.append(normalizers.Replace(source, target))
        if normalize_nfd:
            steps.append(normalizers.NFD())
        if normalize_numbers:
            # Collapse all non-zero digits onto a single digit symbol.
            steps.append(normalizers.Replace(Regex("[۱۲۳۴۵۶۷۸۹]"), "۰"))
        if remove_puctuation:
            steps.append(normalizers.Replace(Regex("[\\.!؛،؟]"), ""))
        if remove_non_farsi:
            # Negated character class: anything NOT listed here is removed.
            allowed_only = Regex("[^\u060c\u061b\u061f\u0622\u0623\u0624\u0626\u0627"
                                 "\u0628\u062a\u062b\u062c\u062d\u062e\u062f\u0630\u0631"
                                 "\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063a"
                                 "\u0641\u0642\u0644\u0645\u0646\u0647\u0648\u067e\u0686"
                                 "\u0698\u06a9\u06af\u06cc\u06f0\u06f1\u06f2\u06f3\u06f4"
                                 "\u06f5\u06f6\u06f7\u06f8\u06f9\\s\u200c\\.\\!]")
            steps.extend([normalizers.Replace(allowed_only, ""), normalizers.Strip()])
        self.normalizer = normalizers.Sequence(steps)

    @classmethod
    def from_pretrained(
        cls,
        vocabulary_size: str,
        ngram: str,
        pruning: str,
        map_to_farsi_alphabet: bool = True,
        normalize_nfd: bool = True,
        normalize_numbers: bool = True,
        remove_puctuation: bool = True,
        remove_non_farsi: bool = True,
    ):
        """Alternate constructor; forwards all arguments positionally to ``cls``."""
        options = (
            map_to_farsi_alphabet,
            normalize_nfd,
            normalize_numbers,
            remove_puctuation,
            remove_non_farsi,
        )
        return cls(vocabulary_size, ngram, pruning, *options)

    def _as_piece_string(self, doc: str):
        """Normalize ``doc`` and return its SentencePiece pieces joined by spaces."""
        normalized = self.normalizer.normalize_str(doc)
        pieces = self.tokenizer.encode(normalized, out_type=str)
        return ' '.join(pieces)

    def score(self, doc: str):
        """Return the KenLM score of ``doc`` after normalization and tokenization."""
        return self.model.score(self._as_piece_string(doc))

    def perplexity(self, doc: str):
        """Return ``10 ** (-score / length)`` for ``doc``, rounded to one decimal.

        ``length`` is the number of whitespace-separated pieces plus one.
        """
        piece_string = self._as_piece_string(doc)
        log_score = self.model.score(piece_string)
        # NOTE(review): the +1 presumably accounts for the end-of-sentence
        # token and the score is presumably a log10 probability - confirm.
        length = len(piece_string.split()) + 1
        return round(10.0 ** (-log_score / length), 1)