import sys
sys.path.insert(0, "GPT_SoVITS")
from text import cleaned_text_to_sequence
from text.cleaner import clean_text
import LangSegment
import re
# Punctuation marks (CJK and ASCII) that terminate a sentence fragment.
splits = {",", "。", "?", "!", ",", ".", "?", "!", "~", ":", ":", "—", "…"}
def clean_text_inf(text, language):
    # Normalize the text and convert it to phonemes, then map the phoneme
    # symbols to the integer IDs expected by the acoustic model.
    phones, word2ph, norm_text = clean_text(text, language)
    phones = cleaned_text_to_sequence(phones)
    return phones, word2ph, norm_text
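# Illustrative usage (assumes the GPT_SoVITS text frontend is importable;
# the actual phoneme IDs depend on the installed symbol table):
#   phones, word2ph, norm_text = clean_text_inf("你好,世界。", "zh")
#   phones    -> list of integer phoneme IDs
#   word2ph   -> per-character phoneme counts used for alignment
#   norm_text -> the normalized text actually fed to the cleaner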
def get_first(text):
    # Return the text up to (but not including) the first splitting punctuation.
    pattern = "[" + "".join(re.escape(sep) for sep in splits) + "]"
    text = re.split(pattern, text)[0].strip()
    return text
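# Illustrative examples (a quick sanity check, not part of the original script):
#   get_first("你好,世界。")   -> "你好"
#   get_first("Hello. World")   -> "Hello"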
def splite_en_inf(sentence, language):
    """
    Split the input sentence into a list of text segments and language tags.

    Args:
        sentence (str): The input sentence to be split.
        language (str): The language tag of the input sentence.

    Returns:
        tuple: A tuple containing two lists - textlist and langlist.
            - textlist: A list of text segments extracted from the input sentence.
            - langlist: A list of language tags corresponding to each text segment.
    """
    pattern = re.compile(r'[a-zA-Z ]+')
    textlist = []
    langlist = []
    pos = 0
    for match in pattern.finditer(sentence):
        start, end = match.span()
        if start > pos:
            textlist.append(sentence[pos:start])
            langlist.append(language)
        textlist.append(sentence[start:end])
        langlist.append("en")
        pos = end
    if pos < len(sentence):
        textlist.append(sentence[pos:])
        langlist.append(language)
    # Merge punctuation-only segments into the previous segment.
    for i in range(len(textlist) - 1, 0, -1):
        if re.match(r'^[\W_]+$', textlist[i]):
            textlist[i - 1] += textlist[i]
            del textlist[i]
            del langlist[i]
    # Merge consecutive segments that share the same language tag.
    i = 0
    while i < len(langlist) - 1:
        if langlist[i] == langlist[i + 1]:
            textlist[i] += textlist[i + 1]
            del textlist[i + 1]
            del langlist[i + 1]
        else:
            i += 1
    return textlist, langlist
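# Illustrative example (a quick sanity check, not part of the original script):
#   splite_en_inf("我喜欢machine learning!", "zh")
#   -> (["我喜欢", "machine learning!"], ["zh", "en"])
# The trailing "!" segment is punctuation-only, so it is merged into the
# preceding English segment rather than getting its own language tag.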
def merge_short_text_in_array(texts, threshold):
    # Greedily concatenate neighboring segments until each merged segment
    # reaches at least `threshold` characters; a short leftover tail is
    # appended to the last merged segment.
    if len(texts) < 2:
        return texts
    result = []
    text = ""
    for ele in texts:
        text += ele
        if len(text) >= threshold:
            result.append(text)
            text = ""
    if len(text) > 0:
        if len(result) == 0:
            result.append(text)
        else:
            result[-1] += text
    return result
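# Illustrative example (a quick sanity check, not part of the original script):
#   merge_short_text_in_array(["你好。", "我是谁?", "再见!"], 5)
#   -> ["你好。我是谁?再见!"]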
def auto_cut(inp):
    # if not re.search(r'[^\w\s]', inp[-1]):
    #     inp += '。'
    inp = inp.strip("\n")
    # Keep the bare punctuation characters (no regex brackets) so the
    # membership test below checks only real punctuation marks.
    split_punds = '?!。?!~:'
    if inp[-1] not in split_punds:
        inp += "。"
    # Split on sentence-final punctuation, keeping the delimiters, then glue
    # each sentence back together with its trailing punctuation mark.
    items = re.split(f'([{split_punds}])', inp)
    items = ["".join(group) for group in zip(items[::2], items[1::2])]

    def process_commas(text):
        # Note: the original list used '——', which a single character can never
        # equal; a single em dash is used here so that separator can match.
        separators = [',', ',', '、', '—', '…']
        count = 0
        processed_text = ""
        for char in text:
            processed_text += char
            if char in separators:
                if count > 12:
                    processed_text += '\n'
                    count = 0
            else:
                count += 1  # For non-separator characters, increment the count.
        return processed_text

    final_items = [process_commas(item) for item in items]
    return "\n".join(final_items)