#
#  Copyright (c) 2013-present, Anoop Kunchukuttan
#  All rights reserved.
#
#  This source code is licensed under the MIT license found in the
#  LICENSE file in the root directory of this source tree.
#

import pandas as pd
import numpy as np
import os

from indicnlp import common
from indicnlp.common import IndicNlpException
from indicnlp import langinfo as li
###
# Phonetic Information about script characters
###

""" Phonetic data about all languages except Tamil """
ALL_PHONETIC_DATA=None

""" Phonetic data for Tamil """
TAMIL_PHONETIC_DATA=None

""" Phonetic vector for all languages except Tamil """
ALL_PHONETIC_VECTORS=None

""" Phonetic vector for Tamil """
TAMIL_PHONETIC_VECTORS=None

""" Length of phonetic vector """
PHONETIC_VECTOR_LENGTH=38

""" Start offset for the phonetic feature vector in the phonetic data vector """
PHONETIC_VECTOR_START_OFFSET=6
## PHONETIC PROPERTIES in order in which they occur in the vector
## This list must be in sync with the keys in the PV_PROP_RANGES dictionary
PV_PROP=['basic_type',
    'vowel_length',
    'vowel_strength',
    'vowel_status',
    'consonant_type',
    'articulation_place',
    'aspiration',
    'voicing',
    'nasalization',
    'vowel_horizontal',
    'vowel_vertical',
    'vowel_roundness',
    ]
###
# Bit vector ranges for various properties
###

PV_PROP_RANGES={
        'basic_type': [0,6],
        'vowel_length': [6,8],
        'vowel_strength': [8,11],
        'vowel_status': [11,13],
        'consonant_type': [13,18],
        'articulation_place': [18,23],
        'aspiration': [23,25],
        'voicing': [25,27],
        'nasalization': [27,29],
        'vowel_horizontal': [29,32],
        'vowel_vertical': [32,36],
        'vowel_roundness': [36,38],
    }
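# Illustrative sketch (not part of the library API): each property occupies a
# half-open [start,end) slice of the 38-dimensional phonetic vector, so it can
# be read with plain slicing. `v` below stands for any vector returned by
# get_phonetic_feature_vector() defined further down; the concrete bit values
# depend on the resource CSV files.
#
#   >>> start,end=PV_PROP_RANGES['voicing']
#   >>> voicing_bits=v[start:end]          # a length-2 slice, positions 25 and 26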
####
# Indexes into the Phonetic Vector
####
PVIDX_BT_VOWEL=0
PVIDX_BT_CONSONANT=1
PVIDX_BT_NUKTA=2
PVIDX_BT_HALANT=3
PVIDX_BT_ANUSVAAR=4
PVIDX_BT_MISC=5
PVIDX_BT_S=PVIDX_BT_VOWEL
PVIDX_BT_E=PVIDX_BT_MISC+1

PVIDX_VSTAT_DEP=12

#####
# Unicode information about characters
#####

SCRIPT_OFFSET_START=0
SCRIPT_OFFSET_RANGE=0x80
def init():
    """
    To be called by library loader, do not call it in your program
    """
    global ALL_PHONETIC_DATA, ALL_PHONETIC_VECTORS, TAMIL_PHONETIC_DATA, TAMIL_PHONETIC_VECTORS, PHONETIC_VECTOR_LENGTH, PHONETIC_VECTOR_START_OFFSET

    ALL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','all_script_phonetic_data.csv'),encoding='utf-8')
    TAMIL_PHONETIC_DATA=pd.read_csv(os.path.join(common.get_resources_path(),'script','tamil_script_phonetic_data.csv'),encoding='utf-8')

    ALL_PHONETIC_VECTORS= ALL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values
    TAMIL_PHONETIC_VECTORS=TAMIL_PHONETIC_DATA.iloc[:,PHONETIC_VECTOR_START_OFFSET:].values

    PHONETIC_VECTOR_LENGTH=ALL_PHONETIC_VECTORS.shape[1]
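# Typical initialization from user code (sketch): this module is loaded via the
# library's top-level loader, which is what calls init(). The resource path below
# is a placeholder for a local checkout of the indic_nlp_resources data.
#
#   >>> from indicnlp import common, loader
#   >>> common.set_resources_path('/path/to/indic_nlp_resources')
#   >>> loader.load()    # invokes init() on this and the other resource-backed modules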
def is_supported_language(lang):
    return lang in li.SCRIPT_RANGES

def get_offset(c,lang):
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    return ord(c)-li.SCRIPT_RANGES[lang][0]
def offset_to_char(off,lang):
    """
    Applicable to Brahmi derived Indic scripts
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    return chr(off+li.SCRIPT_RANGES[lang][0])

def is_indiclang_char(c,lang):
    """
    Applicable to Brahmi derived Indic scripts.
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts.
    """
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    o=get_offset(c,lang)
    return (o>=SCRIPT_OFFSET_START and o<SCRIPT_OFFSET_RANGE) \
            or ord(c)==li.DANDA or ord(c)==li.DOUBLE_DANDA
def in_coordinated_range_offset(c_offset):
    """
    Applicable to Brahmi derived Indic scripts
    """
    return (c_offset>=li.COORDINATED_RANGE_START_INCLUSIVE and c_offset<=li.COORDINATED_RANGE_END_INCLUSIVE)

def in_coordinated_range(c,lang):
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    return in_coordinated_range_offset(get_offset(c,lang))
def get_phonetic_info(lang):
    if not is_supported_language(lang):
        raise IndicNlpException('Language {} not supported'.format(lang))
    phonetic_data= ALL_PHONETIC_DATA if lang!=li.LC_TA else TAMIL_PHONETIC_DATA
    phonetic_vectors= ALL_PHONETIC_VECTORS if lang!=li.LC_TA else TAMIL_PHONETIC_VECTORS

    return (phonetic_data, phonetic_vectors)

def invalid_vector():
    ## TODO: check if np datatype is correct?
    return np.array([0]*PHONETIC_VECTOR_LENGTH)
def get_phonetic_feature_vector(c,lang):
    offset=get_offset(c,lang)

    if not in_coordinated_range_offset(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
        return invalid_vector()

    return phonetic_vectors[offset]
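# Usage sketch (assumes the module has been initialised as described above; the
# outputs shown are what the standard resource data is expected to give):
#
#   >>> v=get_phonetic_feature_vector('\u0915','hi')    # DEVANAGARI LETTER KA
#   >>> len(v)
#   38
#   >>> bool(is_consonant(v))                           # is_consonant is defined below
#   True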
def get_phonetic_feature_vector_offset(offset,lang):
    if not in_coordinated_range_offset(offset):
        return invalid_vector()

    phonetic_data, phonetic_vectors= get_phonetic_info(lang)

    if phonetic_data.iloc[offset]['Valid Vector Representation']==0:
        return invalid_vector()

    return phonetic_vectors[offset]
### Unary operations on phonetic vectors

def is_valid(v):
    return np.sum(v)>0

def is_vowel(v):
    return v[PVIDX_BT_VOWEL]==1

def is_consonant(v):
    return v[PVIDX_BT_CONSONANT]==1

def is_halant(v):
    return v[PVIDX_BT_HALANT]==1

def is_nukta(v):
    return v[PVIDX_BT_NUKTA]==1

def is_anusvaar(v):
    return v[PVIDX_BT_ANUSVAAR]==1

def is_misc(v):
    return v[PVIDX_BT_MISC]==1

def is_dependent_vowel(v):
    return is_vowel(v) and v[PVIDX_VSTAT_DEP]==1

def is_plosive(v):
    return is_consonant(v) and get_property_vector(v,'consonant_type')[0]==1
### Binary operations on phonetic vectors

def or_vectors(v1,v2):
    return np.array([ 1 if (b1+b2)>=1 else 0 for b1,b2 in zip(v1,v2) ])

def xor_vectors(v1,v2):
    return np.array([ 1 if b1!=b2 else 0 for b1,b2 in zip(v1,v2) ])
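# Example (sketch): both operations work element-wise on equal-length 0/1 vectors;
# short toy vectors are used here instead of full 38-dimensional ones.
#
#   >>> or_vectors(np.array([1,0,1,0]),np.array([0,0,1,1]))
#   array([1, 0, 1, 1])
#   >>> xor_vectors(np.array([1,0,1,0]),np.array([0,0,1,1]))
#   array([1, 0, 0, 1])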
### Getting properties from phonetic vectors

def get_property_vector(v,prop_name):
    return v[PV_PROP_RANGES[prop_name][0]:PV_PROP_RANGES[prop_name][1]]

def get_property_value(v,prop_name):
    factor_bits=get_property_vector(v,prop_name).tolist()

    # interpret the property bits as a binary number, the last bit of the slice
    # being the least significant
    value=0
    c=1
    for b in factor_bits[::-1]:
        value+=(c*b)
        c=c*2
    return int(value)
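# Example (sketch): an all-zero vector with the 'vowel_length' bits (positions 6
# and 7) set by hand decodes to the binary value of that slice.
#
#   >>> w=np.zeros(PHONETIC_VECTOR_LENGTH,dtype=int)
#   >>> w[6:8]=[1,0]
#   >>> get_property_value(w,'vowel_length')
#   2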
def lcsr_indic(srcw,tgtw,slang,tlang):
    """
    Compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))

    for si,sc in enumerate(srcw,1):
        for ti,tc in enumerate(tgtw,1):
            so=get_offset(sc,slang)
            to=get_offset(tc,tlang)

            if in_coordinated_range_offset(so) and in_coordinated_range_offset(to) and so==to:
                score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
            elif not (in_coordinated_range_offset(so) or in_coordinated_range_offset(to)) and sc==tc:
                score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
            else:
                score_mat[si,ti]= max(
                        score_mat[si,ti-1],
                        score_mat[si-1,ti])

    return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))
def lcsr_any(srcw,tgtw):
    """
    LCSR computation if both languages have the same script
    """
    score_mat=np.zeros((len(srcw)+1,len(tgtw)+1))

    for si,sc in enumerate(srcw,1):
        for ti,tc in enumerate(tgtw,1):
            if sc==tc:
                score_mat[si,ti]=score_mat[si-1,ti-1]+1.0
            else:
                score_mat[si,ti]= max(
                        score_mat[si,ti-1],
                        score_mat[si-1,ti])

    return (score_mat[-1,-1]/float(max(len(srcw),len(tgtw))),float(len(srcw)),float(len(tgtw)))
def lcsr(srcw,tgtw,slang,tlang):
    """
    Compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: target language string
    slang: source language
    tlang: target language
    """
    if slang==tlang or not is_supported_language(slang) or not is_supported_language(tlang):
        return lcsr_any(srcw,tgtw)
    else:
        return lcsr_indic(srcw,tgtw,slang,tlang)
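# Usage sketch (assumes initialisation as above): lcsr returns a tuple of
# (LCSR, source length, target length). The example compares the same word,
# 'kamal', written in Devanagari (Hindi) and in the Bengali script; the three
# characters share offsets in the coordinated range, so the expected ratio is 1.0.
#
#   >>> sim,slen,tlen=lcsr('\u0915\u092e\u0932','\u0995\u09ae\u09b2','hi','bn')
#   >>> sim
#   1.0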