Commit · f53a618
Parent(s): 9cb06b8
First Commit

- README.md +3 -3
- app.py +49 -0
- requirements.txt +4 -0
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
-title: GAI Workshop
+title: GAI Workshop
-emoji:
+emoji: 💻
 colorFrom: gray
-colorTo:
+colorTo: indigo
 sdk: gradio
 sdk_version: 4.16.0
 app_file: app.py
app.py
ADDED
@@ -0,0 +1,49 @@
+import gradio as gr
+import pandas as pd
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+# Constants for default values
+DEFAULT_CHUNK_SIZE = 100
+DEFAULT_CHUNK_OVERLAP = 0
+DEFAULT_NUM_CHUNKS = 10
+
+def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
+    """
+    Tokenizes the input text based on the selected method and provided parameters.
+    """
+    chunk_size, chunk_overlap, num_chunks = int(chunk_size), int(chunk_overlap), int(num_chunks)  # gr.Number returns floats
+    output = []
+
+    # Ensure text is provided
+    if not text.strip():
+        return pd.DataFrame(columns=['Chunk #', 'Text Chunk', 'Character Count', 'Token Count'])
+
+    if method == "RecursiveCharacterTextSplitter":
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len, is_separator_regex=False)
+        tokenized_texts = text_splitter.split_text(text)[:num_chunks]
+        for i, chunk in enumerate(tokenized_texts):
+            output.append({
+                'Chunk #': i,
+                'Text Chunk': chunk,
+                'Character Count': len(chunk),
+                'Token Count': len(chunk.split())  # whitespace word count, not model tokens
+            })
+
+    df = pd.DataFrame(output)
+    return df
+
+iface = gr.Interface(
+    fn=tokenize_text,
+    inputs=[
+        gr.Dropdown(label="Select Tokenization Method", choices=["RecursiveCharacterTextSplitter"]),
+        gr.Textbox(label="Enter Text", lines=10, placeholder="Type or paste text here."),
+        gr.Number(label="Chunk Size", value=DEFAULT_CHUNK_SIZE),
+        gr.Number(label="Chunk Overlap", value=DEFAULT_CHUNK_OVERLAP),
+        gr.Number(label="Number of Chunks to Display", value=DEFAULT_NUM_CHUNKS)
+    ],
+    outputs=gr.Dataframe(headers=["Chunk #", "Text Chunk", "Character Count", "Token Count"], height=900),
+    title="Text Tokenization Tool",
+    description="A tool for tokenizing text using different methods."
+)
+
+iface.launch()
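As a quick sanity check on the chunking behavior the app exposes, the snippet below calls the same RecursiveCharacterTextSplitter configuration directly, outside Gradio. It is a minimal sketch, not part of the commit; the sample string and parameter values are illustrative only.

from langchain.text_splitter import RecursiveCharacterTextSplitter

sample = "LangChain's RecursiveCharacterTextSplitter breaks long text into fixed-size pieces."
splitter = RecursiveCharacterTextSplitter(
    chunk_size=30,        # small size so even this short sample splits
    chunk_overlap=5,      # characters shared between consecutive chunks
    length_function=len,
    is_separator_regex=False,
)
# Mirrors the columns the app reports: chunk index, text, character count,
# and a whitespace word count standing in for "Token Count".
for i, chunk in enumerate(splitter.split_text(sample)):
    print(i, repr(chunk), len(chunk), len(chunk.split()))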
requirements.txt
ADDED
@@ -0,0 +1,4 @@
+langchain
+gradio
+tiktoken
+sentence-transformers
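Note: with these dependencies installed (for example via pip install -r requirements.txt), running python app.py launches the same Gradio interface locally; the Space installs them automatically at build time. tiktoken and sentence-transformers are listed but not yet imported by app.py.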