IndiSent

Runtime error

App Files Files Community

IndiSent / subword-nmt /subword_nmt /bpe_toy.py

hussain-shk

Duplicate from ai4bharat/IndicTrans-Indic2English

e8aeaf1 almost 2 years ago

raw

history blame contribute delete

1.72 kB

	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	# Author: Rico Sennrich

	"""Use byte pair encoding (BPE) to learn a variable-length encoding of the vocabulary in a text.
	Unlike the original BPE, it does not compress the plain text, but can be used to reduce the vocabulary
	of a text to a configurable number of symbols, with only a small increase in the number of tokens.
	This is an (inefficient) toy implementation that shows the algorithm. For processing large datasets,
	indexing and incremental updates can be used to speed up the implementation (see learn_bpe.py).

	Reference:
	Rico Sennrich, Barry Haddow and Alexandra Birch (2016). Neural Machine Translation of Rare Words with Subword Units.
	Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL 2016). Berlin, Germany.
	"""


	import re
	import sys
	import collections

	def get_stats(vocab):
	    """Count how often each pair of adjacent symbols occurs in a vocabulary.

	    Args:
	        vocab: Mapping from a word, written as whitespace-separated symbols
	            (e.g. ``'l o w</w>'``), to that word's frequency.

	    Returns:
	        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
	        tuples. Each occurrence of a pair inside a word adds that word's
	        frequency to the pair's count.
	    """
	    pairs = collections.defaultdict(int)
	    for word, freq in vocab.items():
	        symbols = word.split()
	        # Zipping the list against itself shifted by one yields every
	        # adjacent pair; a word with fewer than two symbols yields none.
	        for left, right in zip(symbols, symbols[1:]):
	            pairs[left, right] += freq
	    return pairs

	def merge_vocab(pair, v_in):
	    """Return a copy of the vocabulary with one symbol pair merged.

	    Args:
	        pair: The two symbols to join, in order (e.g. ``('e', 's')``).
	        v_in: Mapping from words, written as space-separated symbols, to
	            their frequencies.

	    Returns:
	        A new dict in which every whole-symbol occurrence of
	        ``pair[0] pair[1]`` has been replaced by the concatenated symbol.
	        If several input words become identical after the merge, their
	        frequencies are summed so that no counts are lost.
	    """
	    merged = ''.join(pair)
	    bigram_pattern = re.escape(' '.join(pair))
	    # The lookarounds require whitespace (or the string edge) on both sides,
	    # so the pair is only matched as two complete symbols, never as the
	    # tail/head of longer symbols.
	    p = re.compile(r'(?<!\S)' + bigram_pattern + r'(?!\S)')
	    v_out = {}
	    for word, freq in v_in.items():
	        # A callable replacement is inserted verbatim; passing the merged
	        # symbol as a plain string would make re.sub interpret backslashes
	        # in it as escape sequences or group references.
	        w_out = p.sub(lambda match: merged, word)
	        v_out[w_out] = v_out.get(w_out, 0) + freq
	    return v_out

	# Toy vocabulary: each word is pre-split into space-separated symbols, with
	# the final symbol carrying the '</w>' suffix; values are word frequencies.
	vocab = {'l o w</w>': 5, 'l o w e r</w>': 2,
	         'n e w e s t</w>': 6, 'w i d e s t</w>': 3}
	num_merges = 15
	for i in range(num_merges):
	    pairs = get_stats(vocab)
	    # No adjacent pairs remain once every word is a single symbol.
	    if not pairs:
	        break
	    # Pick the most frequent pair for this merge step.
	    best = max(pairs, key=pairs.get)
	    if pairs[best] < 2:
	        sys.stderr.write('no pair has frequency > 1. Stopping\n')
	        break
	    vocab = merge_vocab(best, vocab)
	    # Emit the learned merge operation.
	    print(best)