botchagalupe committed
Commit f53a618 · 1 Parent(s): 9cb06b8

First Commit

Files changed (3)
  1. README.md +3 -3
  2. app.py +49 -0
  3. requirements.txt +4 -0
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
- title: GAI Workshop-200
- emoji: 🏢
+ title: GAI Workshop
+ emoji: 💻
  colorFrom: gray
- colorTo: green
+ colorTo: indigo
  sdk: gradio
  sdk_version: 4.16.0
  app_file: app.py
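This frontmatter block is the Hugging Face Spaces configuration header: title, emoji, colorFrom, and colorTo control how the Space card is rendered, while sdk, sdk_version, and app_file tell Spaces to serve app.py with Gradio 4.16.0.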
app.py ADDED
@@ -0,0 +1,49 @@
+ import gradio as gr
+ import pandas as pd
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ # Constants for default values
+ DEFAULT_CHUNK_SIZE = 100
+ DEFAULT_CHUNK_OVERLAP = 0
+ DEFAULT_NUM_CHUNKS = 10
+
+ def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
+     """
+     Tokenizes the input text based on the selected method and provided parameters.
+     """
+     num_chunks = int(num_chunks)
+     output = []
+
+     # Ensure text is provided
+     if not text.strip():
+         return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])
+
+     if method == "RecursiveCharacterTextSplitter":
+         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False)
+         tokenized_texts = text_splitter.split_text(text)[:num_chunks]
+         for i, chunk in enumerate(tokenized_texts):
+             output.append({
+                 'Chunk #': i,
+                 'Text Chunk': chunk,
+                 'Character Count': len(chunk),
+                 'Token Count': len(chunk.split())
+             })
+
+     df = pd.DataFrame(output)
+     return df
+
+ iface = gr.Interface(
+     fn=tokenize_text,
+     inputs=[
+         gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
+         gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
+         gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
+         gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
+         gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS)
+     ],
+     outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"], height=900),
+     title="Text Tokenization Tool",
+     description="A tool for tokenizing text using different methods."
+ )
+
+ iface.launch()
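For reference, a minimal standalone sketch of what the RecursiveCharacterTextSplitter call above does with the app's defaults (chunk_size=100, chunk_overlap=0); the sample string is illustrative only:

    from langchain.text_splitter import RecursiveCharacterTextSplitter

    # Mirror app.py's defaults: chunks of at most 100 characters, no overlap.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=100,
        chunk_overlap=0,
        length_function=len,
        is_separator_regex=False,
    )

    sample = (
        "The recursive splitter tries paragraphs, then lines, then words, "
        "then characters, so chunks break at natural boundaries where possible."
    )
    for i, chunk in enumerate(splitter.split_text(sample)):
        # The same per-chunk stats the Gradio table reports.
        print(i, len(chunk), len(chunk.split()), repr(chunk))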
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ langchain
+ gradio
+ tiktoken
+ sentence-transformers
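Note that tiktoken and sentence-transformers are installed but not yet imported by app.py, whose "Token Count" column is a plain whitespace split; presumably they are staged for later workshop steps. As a hedged sketch of what model-accurate counting could look like with tiktoken (the cl100k_base encoding here is an assumption, not something the commit specifies):

    import tiktoken

    # Hypothetical follow-on: count model tokens instead of whitespace words.
    # cl100k_base is an assumed encoding; use whichever matches your model.
    enc = tiktoken.get_encoding("cl100k_base")
    chunk = "Type or paste text here."
    print(len(chunk.split()), len(enc.encode(chunk)))  # words vs. tokens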