aayushraina committed
Commit cf24fb8 · verified · 1 Parent(s): f383bd4

Upload 4 files

Files changed (3)
  1. README.md +6 -0
  2. app_gradio.py +123 -0
  3. requirements.txt +2 -1
README.md CHANGED
@@ -9,12 +9,18 @@ tags:
 pipeline_tag: text2text-generation
 inference: true
 license: mit
+spaces:
+- aayushraina/bpe-hindi
 ---
 
 # Hindi Byte Pair Encoding (BPE) Tokenizer
 
 A specialized BPE tokenizer for Hindi text that achieves efficient compression while maintaining linguistic coherence.
 
+## Online Demo
+
+Try the tokenizer in your browser: [Hindi BPE Tokenizer Demo](https://huggingface.co/spaces/aayushraina/bpe-hindi)
+
 ## Project Overview
 
 This project implements a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text. It features:
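
Note: the README names BPE but the diff context cuts off before the feature list. As background, a BPE trainer repeatedly counts adjacent symbol pairs in the corpus and merges the most frequent pair into a new token. The sketch below is a minimal, generic illustration of that loop in plain Python; it is not the repository's hindi_bpe implementation, and the function name, signature, and toy corpus are all hypothetical.

from collections import Counter

def train_bpe(words, num_merges):
    # Start from characters: each word is a tuple of single-character symbols.
    corpus = Counter(tuple(word) for word in words)
    merges = []
    for _ in range(num_merges):
        # Count how often each adjacent symbol pair occurs, weighted by word frequency.
        pairs = Counter()
        for symbols, freq in corpus.items():
            for a, b in zip(symbols, symbols[1:]):
                pairs[(a, b)] += freq
        if not pairs:
            break
        best = max(pairs, key=pairs.get)
        merges.append(best)
        # Replace every occurrence of the best pair with a single merged symbol.
        new_corpus = Counter()
        for symbols, freq in corpus.items():
            out, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    out.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            new_corpus[tuple(out)] += freq
        corpus = new_corpus
    return merges

# Toy corpus: the pair ('भ', 'ा') is the most frequent and is merged first.
print(train_bpe(["भारत", "भाषा", "भारतीय"], num_merges=3))

Encoding replays these learned merges on new text; the longer merged tokens are what drive the compression statistics reported by the demo added in app_gradio.py below.
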
app_gradio.py ADDED
@@ -0,0 +1,123 @@
+import gradio as gr
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+import pandas as pd
+import plotly.express as px
+import json
+
+class TokenizerDemo:
+    def __init__(self):
+        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")
+
+    def tokenize_text(self, text: str) -> tuple:
+        """Tokenize text and return visualization"""
+        # Preprocess
+        text = preprocess_hindi_text(text)
+
+        # Tokenize
+        tokens = self.tokenizer.encode(text)
+
+        # Create visualization
+        token_df = pd.DataFrame({
+            'Token': tokens,
+            'Length': [len(token) for token in tokens]
+        })
+
+        fig = px.scatter(token_df,
+                         x=range(len(tokens)),
+                         y='Length',
+                         hover_data=['Token'],
+                         title='Token Lengths in Sequence')
+
+        # Calculate statistics
+        stats = {
+            'Total Tokens': len(tokens),
+            'Unique Tokens': len(set(tokens)),
+            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
+            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
+        }
+
+        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
+                              for k, v in stats.items())
+
+        return (
+            " ".join(tokens),  # Tokenized text
+            fig,               # Visualization
+            stats_str          # Statistics
+        )
+
+    def decode_tokens(self, tokens_text: str) -> str:
+        """Decode space-separated tokens back to text"""
+        tokens = tokens_text.split()
+        return self.tokenizer.decode(tokens)
+
+def create_demo() -> gr.Interface:
+    """Create Gradio interface"""
+    demo = TokenizerDemo()
+
+    with gr.Blocks(title="Hindi BPE Tokenizer") as interface:
+        gr.Markdown("""
+        # Hindi BPE Tokenizer Demo
+
+        This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
+        Enter Hindi text to see how it gets tokenized and analyze the token distribution.
+        """)
+
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(
+                    label="Input Hindi Text",
+                    placeholder="हिंदी में टेक्स्ट दर्ज करें...",
+                    lines=5
+                )
+                tokenize_btn = gr.Button("Tokenize")
+
+            with gr.Column():
+                tokens_output = gr.Textbox(
+                    label="Tokenized Output",
+                    lines=5
+                )
+                decode_btn = gr.Button("Decode")
+
+        original_output = gr.Textbox(
+            label="Decoded Text",
+            lines=5
+        )
+
+        stats_output = gr.Textbox(
+            label="Tokenization Statistics",
+            lines=4
+        )
+
+        plot_output = gr.Plot(
+            label="Token Length Distribution"
+        )
+
+        # Set up event handlers
+        tokenize_btn.click(
+            fn=demo.tokenize_text,
+            inputs=input_text,
+            outputs=[tokens_output, plot_output, stats_output]
+        )
+
+        decode_btn.click(
+            fn=demo.decode_tokens,
+            inputs=tokens_output,
+            outputs=original_output
+        )
+
+        # Add examples
+        gr.Examples(
+            examples=[
+                ["हिंदी भाषा बहुत सुंदर है।"],
+                ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
+                ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
+            ],
+            inputs=input_text
+        )
+
+    return interface
+
+# Create and launch the demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()
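
Note: the only parts of the hindi_bpe API this diff exercises are HindiBPE.load_tokenizer, encode, decode, and preprocess_hindi_text. A minimal direct-usage sketch based solely on those calls might look like the following; the module itself is not part of this commit, so treat the exact signatures and the "tokenizer" directory name as assumptions.

# Direct use of the tokenizer that app_gradio.py wraps. Inferred only from the
# calls visible in this diff; signatures are assumptions, not documented API.
from hindi_bpe import HindiBPE, preprocess_hindi_text

tokenizer = HindiBPE.load_tokenizer("tokenizer")         # load the trained tokenizer files
text = preprocess_hindi_text("हिंदी भाषा बहुत सुंदर है।")  # same cleanup the demo applies
tokens = tokenizer.encode(text)                          # list of subword token strings
print(len(tokens), tokens)
print(tokenizer.decode(tokens))                          # should round-trip to `text`

Running python app_gradio.py locally should launch the same interface via demo.launch(), mirroring the hosted Space linked in the README.
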
requirements.txt CHANGED
@@ -3,8 +3,9 @@ pandas==1.5.3
 plotly==5.13.0
 kagglehub
 streamlit
-beautifulsoup4
+beautifulsoup4
 huggingface-hub>=0.19.0
 tqdm
 matplotlib
 gitpython>=3.1.0
+gradio>=4.0.0