File size: 8,277 Bytes
b505f70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import gradio as gr
import jiwer
import pandas as pd
import logging
from typing import List, Optional, Tuple, Dict

# Configure root logging once at import time.
# force=True replaces any handlers already attached to the root logger so the
# format below is the one actually used; output goes to a single stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

def calculate_wer_metrics(
    hypothesis: str,
    reference: str,
    normalize: bool = True,
    words_to_filter: Optional[List[str]] = None
) -> Dict:
    """
    Calculate WER metrics between hypothesis and reference texts.

    Args:
        hypothesis (str): The hypothesis text
        reference (str): The reference text
        normalize (bool): Whether to normalize texts before comparison
            (expand contractions, lowercase, collapse spaces, strip punctuation)
        words_to_filter (List[str], optional): Words to drop from both texts
            before comparison. Matching is case-insensitive; blank entries are
            ignored. Only applied when ``normalize`` is True.

    Returns:
        dict: The measures dictionary produced by ``jiwer.compute_measures``
        (callers in this file read the keys ``wer``, ``mer``, ``wil``, ``wip``,
        ``substitutions``, ``deletions``, ``insertions`` and ``hits``).

    Raises:
        ValueError: If inputs are blank, or result in empty text after
            normalization/filtering, or the transformation itself fails.
    """
    # Validate first so blank input is rejected before any other work is done.
    if not hypothesis.strip() or not reference.strip():
        raise ValueError("Both hypothesis and reference texts must contain non-empty strings")

    logger.info(
        "Calculating WER metrics with inputs - Hypothesis: %s, Reference: %s",
        hypothesis,
        reference,
    )

    # NOTE(review): jiwer.compute_measures is, as far as I know, deprecated in
    # newer jiwer releases in favour of process_words -- confirm against the
    # pinned jiwer version before upgrading.
    if not normalize:
        return jiwer.compute_measures(
            truth=reference,
            hypothesis=hypothesis
        )

    # Basic normalization; the final step turns the text into a list of
    # sentences, each sentence being a list of word tokens.
    basic_transform = jiwer.Compose([
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.ToLowerCase(),
        jiwer.RemoveMultipleSpaces(),
        jiwer.RemovePunctuation(),
        jiwer.Strip(),
        jiwer.ReduceToListOfListOfWords()
    ])

    # Lowercase the filter words once and ignore blank entries.
    blocked_words = {
        word.strip().lower()
        for word in (words_to_filter or [])
        if word and word.strip()
    }

    if blocked_words:
        transformation = jiwer.Compose([
            basic_transform,
            _make_word_filter(blocked_words)
        ])
    else:
        transformation = basic_transform

    # Pre-check the transformed text so the caller gets a clear error message
    # instead of a failure from inside jiwer.
    try:
        transformed_ref = transformation(reference)
        transformed_hyp = transformation(hypothesis)
        # The transform yields [[...]]; the outer list is truthy even when no
        # words remain, so inspect the inner token lists.
        if not any(transformed_ref) or not any(transformed_hyp):
            raise ValueError("Text is empty after normalization")
        logger.debug("Transformed reference: %s", transformed_ref)
        logger.debug("Transformed hypothesis: %s", transformed_hyp)
    except Exception as e:
        logger.error("Transformation error: %s", e)
        raise ValueError(f"Error during text transformation: {str(e)}") from e

    return jiwer.compute_measures(
        truth=reference,
        hypothesis=hypothesis,
        truth_transform=transformation,
        hypothesis_transform=transformation
    )


def _make_word_filter(blocked_words: set):
    """Return a transform that removes ``blocked_words`` from tokenized sentences.

    The returned callable expects the list-of-lists-of-words structure produced
    by the normalization pipeline and keeps that structure (one inner list per
    sentence), so it can be used as the last step of a ``jiwer.Compose``.
    ``blocked_words`` must already be lowercased.
    """
    def filter_words_transform(sentences: List[List[str]]) -> List[List[str]]:
        filtered = [
            [word for word in sentence if word.lower() not in blocked_words]
            for sentence in sentences
        ]
        if not any(filtered):
            raise ValueError("Text is empty after filtering words")
        return filtered

    return filter_words_transform

def process_inputs(
    reference: str,
    hypothesis: str,
    normalize: bool,
    words_to_filter: str
) -> Tuple[str, str, str, str]:
    """
    Process inputs and calculate WER metrics.

    Args:
        reference (str): Reference text
        hypothesis (str): Hypothesis text
        normalize (bool): Whether to normalize text
        words_to_filter (str): Comma-separated words to filter; blank entries
            (e.g. from "um,,uh") are ignored

    Returns:
        Tuple[str, str, str, str]: ``(metrics_html, error_analysis_html,
        explanation_html, error_message)``. On success the error message is
        empty; on failure the first three are empty and the fourth holds the
        HTML-escaped error text. When either text is missing or blank, the
        first element carries a prompt and the rest are empty.
    """
    # Local import keeps this function self-contained; html is stdlib.
    import html

    # Treat None, empty and whitespace-only text alike: nothing to evaluate.
    if not (reference and reference.strip()) or not (hypothesis and hypothesis.strip()):
        return "Please provide both reference and hypothesis texts.", "", "", ""

    try:
        filter_words = None
        if words_to_filter:
            # Drop blank tokens; fall back to None when nothing usable remains.
            filter_words = [
                word.strip() for word in words_to_filter.split(",") if word.strip()
            ] or None

        measures = calculate_wer_metrics(
            hypothesis=hypothesis,
            reference=reference,
            normalize=normalize,
            words_to_filter=filter_words
        )

        # Format main metrics
        metrics_df = pd.DataFrame({
            'Metric': ['WER', 'MER', 'WIL', 'WIP'],
            'Value': [
                f"{measures['wer']:.3f}",
                f"{measures['mer']:.3f}",
                f"{measures['wil']:.3f}",
                f"{measures['wip']:.3f}"
            ]
        })

        # Format error analysis
        error_df = pd.DataFrame({
            'Metric': ['Substitutions', 'Deletions', 'Insertions', 'Hits'],
            'Count': [
                measures['substitutions'],
                measures['deletions'],
                measures['insertions'],
                measures['hits']
            ]
        })

        metrics_html = metrics_df.to_html(index=False)
        error_html = error_df.to_html(index=False)

        explanation = """
        <h3>Metrics Explanation:</h3>
        <ul>
            <li><b>WER (Word Error Rate)</b>: The percentage of words that were incorrectly predicted</li>
            <li><b>MER (Match Error Rate)</b>: The percentage of words that were incorrectly matched</li>
            <li><b>WIL (Word Information Lost)</b>: The percentage of word information that was lost</li>
            <li><b>WIP (Word Information Preserved)</b>: The percentage of word information that was preserved</li>
        </ul>
        """

        return metrics_html, error_html, explanation, ""

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the
        # handler, and keep the traceback in the log.
        error_msg = f"Error calculating WER: {str(e)}"
        logger.exception(error_msg)
        # The message is rendered in an HTML component, so escape it.
        return "", "", "", html.escape(error_msg)

def load_example() -> Tuple[str, str]:
    """Return a demo sentence pair, reference first and hypothesis second."""
    reference_text = "the quick brown fox jumps over the lazy dog"
    hypothesis_text = "the quick brown fox jumped over lazy dog"
    return reference_text, hypothesis_text

def create_interface() -> gr.Blocks:
    """Build and return the Gradio Blocks UI for the WER tool.

    The layout has two text inputs (reference and hypothesis), a normalization
    checkbox, a filter-words box, two buttons, and four HTML output areas.
    The "Load Example" button fills the text inputs from ``load_example``;
    the "Calculate WER" button runs ``process_inputs`` and shows its results.
    """
    intro_text = (
        "This tool helps you evaluate the Word Error Rate (WER) between a reference "
        "text and a hypothesis text. WER is commonly used in speech recognition and "
        "machine translation evaluation."
    )

    with gr.Blocks(title="WER Evaluation Tool") as ui:
        gr.Markdown("# Word Error Rate (WER) Evaluation Tool")
        gr.Markdown(intro_text)

        # Text inputs, side by side.
        with gr.Row():
            with gr.Column():
                ref_box = gr.Textbox(
                    lines=5,
                    label="Reference Text",
                    placeholder="Enter the reference text here...",
                )
            with gr.Column():
                hyp_box = gr.Textbox(
                    lines=5,
                    label="Hypothesis Text",
                    placeholder="Enter the hypothesis text here...",
                )

        # Processing options.
        with gr.Row():
            normalize_box = gr.Checkbox(
                value=True,
                label="Normalize text (lowercase, remove punctuation)",
            )
            filter_box = gr.Textbox(
                label="Words to filter (comma-separated)",
                placeholder="e.g., um, uh, ah",
            )

        # Action buttons.
        with gr.Row():
            load_button = gr.Button("Load Example")
            run_button = gr.Button("Calculate WER", variant="primary")

        # Result areas.
        with gr.Row():
            metrics_view = gr.HTML(label="Main Metrics")
            errors_view = gr.HTML(label="Error Analysis")

        explanation_view = gr.HTML()
        message_view = gr.HTML()

        # Wire the buttons to their handlers.
        load_button.click(
            fn=load_example,
            outputs=[ref_box, hyp_box],
        )
        run_button.click(
            fn=process_inputs,
            inputs=[ref_box, hyp_box, normalize_box, filter_box],
            outputs=[metrics_view, errors_view, explanation_view, message_view],
        )

    return ui

if __name__ == "__main__":
    # Script entry point: log startup, then build the UI and serve it.
    logger.info("Application started")
    create_interface().launch()