sanchit-gandhi committed on
Commit
eed20cf
·
1 Parent(s): 0a74dbb

repeated n-grams

Browse files
Files changed (2) hide show
  1. app.py +12 -4
  2. requirements.txt +2 -1
app.py CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
8
  from datasets import load_dataset
9
  import pandas as pd
10
  from jiwer import process_words, wer_default
 
11
 
12
 
13
  class Action(Enum):
@@ -63,7 +64,7 @@ target_dtype = np.int16
63
  max_range = np.iinfo(target_dtype).max
64
 
65
 
66
- def get_visualisation(idx, model="large-v2", round_dp=2):
67
  idx -= 1
68
  audio = dataset[idx]["audio"]
69
  array = (audio["array"] * max_range).astype(np.int16)
@@ -83,12 +84,19 @@ def get_visualisation(idx, model="large-v2", round_dp=2):
83
  100 * wer_output.insertions / len(wer_output.references[0]), round_dp
84
  )
85
 
86
- rel_length = round(len(text2.split()) / len(text1.split()), round_dp)
 
 
 
 
 
 
 
87
 
88
  diff = compare_string(text1, text2)
89
  full_text = style_text(diff)
90
 
91
- return (sampling_rate, array), wer_percentage, ier_percentage, rel_length, full_text
92
 
93
 
94
  def get_side_by_side_visualisation(idx):
@@ -136,7 +144,7 @@ if __name__ == "__main__":
136
  "Model",
137
  "Word Error Rate (WER)",
138
  "Insertion Error Rate (IER)",
139
- "Rel length (ref length / tgt length)",
140
  ],
141
  height=1000,
142
  )
 
8
  from datasets import load_dataset
9
  import pandas as pd
10
  from jiwer import process_words, wer_default
11
+ from nltk import ngrams
12
 
13
 
14
  class Action(Enum):
 
64
  max_range = np.iinfo(target_dtype).max
65
 
66
 
67
+ def get_visualisation(idx, model="large-v2", round_dp=2, ngram_degree=5):
68
  idx -= 1
69
  audio = dataset[idx]["audio"]
70
  array = (audio["array"] * max_range).astype(np.int16)
 
84
  100 * wer_output.insertions / len(wer_output.references[0]), round_dp
85
  )
86
 
87
+ all_ngrams = list(ngrams(text2.split(), ngram_degree))
88
+
89
+ unique_ngrams = []
90
+ for ngram in all_ngrams:
91
+ if ngram not in unique_ngrams:
92
+ unique_ngrams.append(ngram)
93
+
94
+ repeated_ngrams = len(all_ngrams) - len(unique_ngrams)
95
 
96
  diff = compare_string(text1, text2)
97
  full_text = style_text(diff)
98
 
99
+ return (sampling_rate, array), wer_percentage, ier_percentage, repeated_ngrams, full_text
100
 
101
 
102
  def get_side_by_side_visualisation(idx):
 
144
  "Model",
145
  "Word Error Rate (WER)",
146
  "Insertion Error Rate (IER)",
147
+ "Repeated 5-grams",
148
  ],
149
  height=1000,
150
  )
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  pandas
2
  datasets[audio]
3
- jiwer
 
 
1
  pandas
2
  datasets[audio]
3
+ jiwer
4
+ nltk