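"""Gradio front-end for a GraphRAG indexing pipeline.

Provides tabs for indexing, prompt tuning, and data management, backed by a
local REST API (API_BASE_URL) and Ollama/OpenAI-compatible model endpoints.
"""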
import gradio as gr
import requests
import logging
import os
import json
import shutil
import glob
import queue
import lancedb
from datetime import datetime
from dotenv import load_dotenv, set_key
import yaml
import pandas as pd
from typing import List, Optional
from pydantic import BaseModel
# Set up logging
log_queue = queue.Queue()
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

load_dotenv('indexing/.env')

API_BASE_URL = os.getenv('API_BASE_URL', 'http://localhost:8012')
LLM_API_BASE = os.getenv('LLM_API_BASE', 'http://localhost:11434')
EMBEDDINGS_API_BASE = os.getenv('EMBEDDINGS_API_BASE', 'http://localhost:11434')
ROOT_DIR = os.getenv('ROOT_DIR', 'indexing')

# Data models
class IndexingRequest(BaseModel):
    llm_model: str
    embed_model: str
    llm_api_base: str
    embed_api_base: str
    root: str
    verbose: bool = False
    nocache: bool = False
    resume: Optional[str] = None
    reporter: str = "rich"
    emit: List[str] = ["parquet"]
    custom_args: Optional[str] = None
class PromptTuneRequest(BaseModel):
    # f-strings were missing here, so the literal text "./{ROOT_DIR}" was used
    root: str = f"./{ROOT_DIR}"
    domain: Optional[str] = None
    method: str = "random"
    limit: int = 15
    language: Optional[str] = None
    max_tokens: int = 2000
    chunk_size: int = 200
    no_entity_types: bool = False
    output: str = f"./{ROOT_DIR}/prompts"
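# Illustrative only: the backend is expected to map these request fields to
# graphrag CLI options (e.g. root -> --root, emit -> --emit). A minimal
# indexing request, assuming an Ollama llama3 / nomic-embed-text setup:
#   IndexingRequest(llm_model="llama3", embed_model="nomic-embed-text",
#                   llm_api_base=LLM_API_BASE, embed_api_base=EMBEDDINGS_API_BASE,
#                   root=ROOT_DIR)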
class QueueHandler(logging.Handler):
    def __init__(self, log_queue):
        super().__init__()
        self.log_queue = log_queue

    def emit(self, record):
        self.log_queue.put(self.format(record))

queue_handler = QueueHandler(log_queue)
logging.getLogger().addHandler(queue_handler)

def update_logs():
    logs = []
    while not log_queue.empty():
        logs.append(log_queue.get())
    return "\n".join(logs)
##########SETTINGS################

def load_settings():
    config_path = os.getenv('GRAPHRAG_CONFIG', 'config.yaml')
    if os.path.exists(config_path):
        with open(config_path, 'r') as config_file:
            # safe_load returns None for an empty file; fall back to {}
            config = yaml.safe_load(config_file) or {}
    else:
        config = {}

    settings = {
        'llm_model': os.getenv('LLM_MODEL', config.get('llm_model')),
        'embedding_model': os.getenv('EMBEDDINGS_MODEL', config.get('embedding_model')),
        'community_level': int(os.getenv('COMMUNITY_LEVEL', config.get('community_level', 2))),
        'token_limit': int(os.getenv('TOKEN_LIMIT', config.get('token_limit', 4096))),
        'api_key': os.getenv('GRAPHRAG_API_KEY', config.get('api_key')),
        'api_base': os.getenv('LLM_API_BASE', config.get('api_base')),
        'embeddings_api_base': os.getenv('EMBEDDINGS_API_BASE', config.get('embeddings_api_base')),
        'api_type': os.getenv('API_TYPE', config.get('api_type', 'openai')),
    }
    return settings
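# A matching config.yaml (illustrative values; environment variables take
# precedence) might look like:
#   llm_model: llama3
#   embedding_model: nomic-embed-text
#   community_level: 2
#   token_limit: 4096
#   api_base: http://localhost:11434
#   embeddings_api_base: http://localhost:11434
#   api_type: openai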
#######FILE_MANAGEMENT##############

def list_output_files(root_dir):
    output_dir = os.path.join(root_dir, "output")
    files = []
    for root, _, filenames in os.walk(output_dir):
        for filename in filenames:
            files.append(os.path.join(root, filename))
    return files

def update_file_list():
    files = list_input_files()
    return gr.update(choices=[f["path"] for f in files])

def update_file_content(file_path):
    if not file_path:
        return ""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        return content
    except Exception as e:
        logging.error(f"Error reading file: {str(e)}")
        return f"Error reading file: {str(e)}"
def list_output_folders():
    output_dir = os.path.join(ROOT_DIR, "output")
    if not os.path.exists(output_dir):
        return []
    folders = [f for f in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, f))]
    return sorted(folders, reverse=True)

def update_output_folder_list():
    folders = list_output_folders()
    return gr.update(choices=folders, value=folders[0] if folders else None)
def list_folder_contents(folder_path):
    # Takes the full folder path directly; the original mixed folder names
    # and full paths across callers, which double-joined ROOT_DIR/output.
    contents = []
    if os.path.exists(folder_path):
        for item in os.listdir(folder_path):
            item_path = os.path.join(folder_path, item)
            if os.path.isdir(item_path):
                contents.append(f"[DIR] {item}")
            else:
                _, ext = os.path.splitext(item)
                contents.append(f"[{ext[1:].upper()}] {item}")
    return contents

def update_folder_content_list(folder_name):
    if isinstance(folder_name, list) and folder_name:
        folder_name = folder_name[0]
    elif not folder_name:
        return gr.update(choices=[])
    contents = list_folder_contents(os.path.join(ROOT_DIR, "output", folder_name, "artifacts"))
    return gr.update(choices=contents)
def handle_content_selection(folder_name, selected_item):
    if isinstance(selected_item, list) and selected_item:
        selected_item = selected_item[0]  # Take the first item if it's a list
    if isinstance(selected_item, str) and selected_item.startswith("[DIR]"):
        dir_name = selected_item[6:]  # Remove "[DIR] " prefix
        # Listed directories live inside the run's artifacts folder
        sub_contents = list_folder_contents(os.path.join(ROOT_DIR, "output", folder_name, "artifacts", dir_name))
        return gr.update(choices=sub_contents), "", ""
    elif isinstance(selected_item, str):
        file_name = selected_item.split("] ")[1] if "]" in selected_item else selected_item  # Remove file type prefix if present
        file_path = os.path.join(ROOT_DIR, "output", folder_name, "artifacts", file_name)
        file_size = os.path.getsize(file_path)
        file_type = os.path.splitext(file_name)[1]
        file_info = f"File: {file_name}\nSize: {file_size} bytes\nType: {file_type}"
        content = read_file_content(file_path)
        return gr.update(), file_info, content
    else:
        return gr.update(), "", ""
def initialize_selected_folder(folder_name):
    if not folder_name:
        return "Please select a folder first.", gr.update(choices=[])
    folder_path = os.path.join(ROOT_DIR, "output", folder_name, "artifacts")
    if not os.path.exists(folder_path):
        return f"Artifacts folder not found in '{folder_name}'.", gr.update(choices=[])
    contents = list_folder_contents(folder_path)
    return f"Folder '{folder_name}/artifacts' initialized with {len(contents)} items.", gr.update(choices=contents)
def upload_file(file):
    if file is not None:
        input_dir = os.path.join(ROOT_DIR, 'input')
        os.makedirs(input_dir, exist_ok=True)
        # Get the original filename from the uploaded file
        original_filename = file.name
        # Create the destination path
        destination_path = os.path.join(input_dir, os.path.basename(original_filename))
        # Move the uploaded file to the destination path
        shutil.move(file.name, destination_path)
        logging.info(f"File uploaded and moved to: {destination_path}")
        status = f"File uploaded: {os.path.basename(original_filename)}"
    else:
        status = "No file uploaded"
    # Get the updated file list
    updated_file_list = [f["path"] for f in list_input_files()]
    return status, gr.update(choices=updated_file_list), update_logs()

def list_input_files():
    input_dir = os.path.join(ROOT_DIR, 'input')
    files = []
    if os.path.exists(input_dir):
        files = [f for f in os.listdir(input_dir) if os.path.isfile(os.path.join(input_dir, f))]
    return [{"name": f, "path": os.path.join(input_dir, f)} for f in files]

def delete_file(file_path):
    try:
        os.remove(file_path)
        logging.info(f"File deleted: {file_path}")
        status = f"File deleted: {os.path.basename(file_path)}"
    except Exception as e:
        logging.error(f"Error deleting file: {str(e)}")
        status = f"Error deleting file: {str(e)}"
    # Get the updated file list
    updated_file_list = [f["path"] for f in list_input_files()]
    return status, gr.update(choices=updated_file_list), update_logs()

def read_file_content(file_path):
    try:
        if file_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
            # Get basic information about the DataFrame
            info = f"Parquet File: {os.path.basename(file_path)}\n"
            info += f"Rows: {len(df)}, Columns: {len(df.columns)}\n\n"
            info += "Column Names:\n" + "\n".join(df.columns) + "\n\n"
            # Display first few rows
            info += "First 5 rows:\n"
            info += df.head().to_string() + "\n\n"
            # Display basic statistics
            info += "Basic Statistics:\n"
            info += df.describe().to_string()
            return info
        else:
            with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                content = file.read()
            return content
    except Exception as e:
        logging.error(f"Error reading file: {str(e)}")
        return f"Error reading file: {str(e)}"
def save_file_content(file_path, content):
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
        logging.info(f"File saved: {file_path}")
        status = f"File saved: {os.path.basename(file_path)}"
    except Exception as e:
        logging.error(f"Error saving file: {str(e)}")
        status = f"Error saving file: {str(e)}"
    return status, update_logs()
def manage_data():
    db = lancedb.connect(f"{ROOT_DIR}/lancedb")
    tables = db.table_names()
    table_info = ""
    if tables:
        table = db.open_table(tables[0])
        table_info = f"Table: {tables[0]}\nSchema: {table.schema}"
    input_files = list_input_files()
    return {
        "database_info": f"Tables: {', '.join(tables)}\n\n{table_info}",
        "input_files": input_files
    }
def find_latest_graph_file(root_dir):
    pattern = os.path.join(root_dir, "output", "*", "artifacts", "*.graphml")
    graph_files = glob.glob(pattern)
    if not graph_files:
        # If nothing matched, retry against the newest run folder,
        # skipping .DS_Store entries
        output_dir = os.path.join(root_dir, "output")
        run_dirs = [d for d in os.listdir(output_dir) if os.path.isdir(os.path.join(output_dir, d)) and d != ".DS_Store"]
        if run_dirs:
            latest_run = max(run_dirs)
            pattern = os.path.join(root_dir, "output", latest_run, "artifacts", "*.graphml")
            graph_files = glob.glob(pattern)
    if not graph_files:
        return None
    # Pick the most recently modified file
    latest_file = max(graph_files, key=os.path.getmtime)
    return latest_file
def find_latest_output_folder():
    root_dir = f"{ROOT_DIR}/output"
    folders = [f for f in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, f))]
    if not folders:
        raise ValueError("No output folders found")
    # Sort folders by creation time, most recent first
    sorted_folders = sorted(folders, key=lambda x: os.path.getctime(os.path.join(root_dir, x)), reverse=True)
    latest_folder = None
    timestamp = None
    for folder in sorted_folders:
        try:
            # Try to parse the folder name as a timestamp
            timestamp = datetime.strptime(folder, "%Y%m%d-%H%M%S")
            latest_folder = folder
            break
        except ValueError:
            # If the folder name is not a valid timestamp, skip it
            continue
    if latest_folder is None:
        raise ValueError("No valid timestamp folders found")
    latest_path = os.path.join(root_dir, latest_folder)
    artifacts_path = os.path.join(latest_path, "artifacts")
    if not os.path.exists(artifacts_path):
        raise ValueError(f"Artifacts folder not found in {latest_path}")
    return latest_path, latest_folder
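# GraphRAG names each indexing run's output folder with a "%Y%m%d-%H%M%S"
# timestamp (e.g. "20240101-120000"), so the newest folder whose name parses
# with that format is treated as the latest run.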
def initialize_data():
    global entity_df, relationship_df, text_unit_df, report_df, covariate_df
    tables = {
        "entity_df": "create_final_nodes",
        "relationship_df": "create_final_edges",
        "text_unit_df": "create_final_text_units",
        "report_df": "create_final_reports",
        "covariate_df": "create_final_covariates"
    }
    timestamp = None  # Initialize timestamp to None
    try:
        latest_output_folder, timestamp = find_latest_output_folder()
        artifacts_folder = os.path.join(latest_output_folder, "artifacts")
        for df_name, file_prefix in tables.items():
            file_pattern = os.path.join(artifacts_folder, f"{file_prefix}*.parquet")
            matching_files = glob.glob(file_pattern)
            if matching_files:
                latest_file = max(matching_files, key=os.path.getctime)
                df = pd.read_parquet(latest_file)
                globals()[df_name] = df
                logging.info(f"Successfully loaded {df_name} from {latest_file}")
            else:
                logging.warning(f"No matching file found for {df_name} in {artifacts_folder}. Initializing as an empty DataFrame.")
                globals()[df_name] = pd.DataFrame()
    except Exception as e:
        logging.error(f"Error initializing data: {str(e)}")
        for df_name in tables.keys():
            globals()[df_name] = pd.DataFrame()
    return timestamp

# Call initialize_data and store the timestamp
current_timestamp = initialize_data()
###########MODELS##################

def normalize_api_base(api_base: str) -> str:
    """Normalize the API base URL by removing trailing slashes and /v1 or /api suffixes."""
    api_base = api_base.rstrip('/')
    # Strip by suffix length; the original always cut 3 chars, which mangled
    # the 4-character '/api' suffix
    for suffix in ('/v1', '/api'):
        if api_base.endswith(suffix):
            api_base = api_base[:-len(suffix)]
            break
    return api_base
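# For example:
#   normalize_api_base("http://localhost:11434/v1/")  -> "http://localhost:11434"
#   normalize_api_base("http://localhost:11434/api")  -> "http://localhost:11434"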
def is_ollama_api(base_url: str) -> bool:
    """Check if the given base URL is for Ollama API."""
    try:
        response = requests.get(f"{normalize_api_base(base_url)}/api/tags", timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False

def get_ollama_models(base_url: str) -> List[str]:
    """Fetch available models from Ollama API."""
    try:
        response = requests.get(f"{normalize_api_base(base_url)}/api/tags", timeout=5)
        response.raise_for_status()
        models = response.json().get('models', [])
        return [model['name'] for model in models]
    except requests.RequestException as e:
        logger.error(f"Error fetching Ollama models: {str(e)}")
        return []

def get_openai_compatible_models(base_url: str) -> List[str]:
    """Fetch available models from OpenAI-compatible API."""
    try:
        response = requests.get(f"{normalize_api_base(base_url)}/v1/models", timeout=5)
        response.raise_for_status()
        models = response.json().get('data', [])
        return [model['id'] for model in models]
    except requests.RequestException as e:
        logger.error(f"Error fetching OpenAI-compatible models: {str(e)}")
        return []

def get_local_models(base_url: str) -> List[str]:
    """Get available models based on the API type."""
    if is_ollama_api(base_url):
        return get_ollama_models(base_url)
    else:
        return get_openai_compatible_models(base_url)
def get_model_params(base_url: str, model_name: str) -> dict:
    """Get model parameters for Ollama models."""
    if is_ollama_api(base_url):
        try:
            response = requests.post(f"{normalize_api_base(base_url)}/api/show", json={"name": model_name}, timeout=5)
            response.raise_for_status()
            model_info = response.json()
            return model_info.get('parameters', {})
        except requests.RequestException as e:
            logger.error(f"Error fetching Ollama model parameters: {str(e)}")
    # Non-Ollama endpoints (and request failures) yield an empty dict; the
    # original implicitly returned None for the non-Ollama path
    return {}
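# Note: /api/show is Ollama's model-info endpoint; its 'parameters' field
# carries the model's Modelfile parameter block when one is defined, so the
# returned value may be a string rather than a dict.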
#########API###########

def start_indexing(request: IndexingRequest):
    url = f"{API_BASE_URL}/v1/index"
    try:
        response = requests.post(url, json=request.dict())
        response.raise_for_status()
        result = response.json()
        return result['message'], gr.update(interactive=False), gr.update(interactive=True)
    except requests.RequestException as e:
        logger.error(f"Error starting indexing: {str(e)}")
        return f"Error: {str(e)}", gr.update(interactive=True), gr.update(interactive=False)

def check_indexing_status():
    url = f"{API_BASE_URL}/v1/index_status"
    try:
        response = requests.get(url)
        response.raise_for_status()
        result = response.json()
        return result['status'], "\n".join(result['logs'])
    except requests.RequestException as e:
        logger.error(f"Error checking indexing status: {str(e)}")
        return "Error", f"Failed to check indexing status: {str(e)}"

def start_prompt_tuning(request: PromptTuneRequest):
    url = f"{API_BASE_URL}/v1/prompt_tune"
    try:
        response = requests.post(url, json=request.dict())
        response.raise_for_status()
        result = response.json()
        return result['message'], gr.update(interactive=False)
    except requests.RequestException as e:
        logger.error(f"Error starting prompt tuning: {str(e)}")
        return f"Error: {str(e)}", gr.update(interactive=True)

def check_prompt_tuning_status():
    url = f"{API_BASE_URL}/v1/prompt_tune_status"
    try:
        response = requests.get(url)
        response.raise_for_status()
        result = response.json()
        return result['status'], "\n".join(result['logs'])
    except requests.RequestException as e:
        logger.error(f"Error checking prompt tuning status: {str(e)}")
        return "Error", f"Failed to check prompt tuning status: {str(e)}"
def update_model_params(model_name, base_url=LLM_API_BASE):
    # The original dropped the base_url argument that get_model_params requires
    params = get_model_params(base_url, model_name)
    return gr.update(value=json.dumps(params, indent=2))
###########################

css = """
html, body {
    margin: 0;
    padding: 0;
    height: 100vh;
    overflow: hidden;
}
.gradio-container {
    margin: 0 !important;
    padding: 0 !important;
    width: 100vw !important;
    max-width: 100vw !important;
    height: 100vh !important;
    max-height: 100vh !important;
    overflow: auto;
    display: flex;
    flex-direction: column;
}
#main-container {
    flex: 1;
    display: flex;
    overflow: hidden;
}
#left-column, #right-column {
    height: 100%;
    overflow-y: auto;
    padding: 10px;
}
#left-column {
    flex: 1;
}
#right-column {
    flex: 2;
    display: flex;
    flex-direction: column;
}
#chat-container {
    flex: 0 0 auto; /* Don't allow this to grow */
    height: 100%;
    display: flex;
    flex-direction: column;
    overflow: hidden;
    border: 1px solid var(--color-accent);
    border-radius: 8px;
    padding: 10px;
    overflow-y: auto;
}
#chatbot {
    overflow-y: hidden;
    height: 100%;
}
#chat-input-row {
    margin-top: 10px;
}
#visualization-plot {
    width: 100%;
    aspect-ratio: 1 / 1;
    max-height: 600px; /* Adjust this value as needed */
}
#vis-controls-row {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-top: 10px;
}
#vis-controls-row > * {
    flex: 1;
    margin: 0 5px;
}
#vis-status {
    margin-top: 10px;
}
/* Chat input styling */
#chat-input-row {
    display: flex;
    flex-direction: column;
}
#chat-input-row > div {
    width: 100% !important;
}
#chat-input-row input[type="text"] {
    width: 100% !important;
}
/* Adjust padding for all containers */
.gr-box, .gr-form, .gr-panel {
    padding: 10px !important;
}
/* Ensure all textboxes and textareas have full height */
.gr-textbox, .gr-textarea {
    height: auto !important;
    min-height: 100px !important;
}
/* Ensure all dropdowns have full width */
.gr-dropdown {
    width: 100% !important;
}
:root {
    --color-background: #2C3639;
    --color-foreground: #3F4E4F;
    --color-accent: #A27B5C;
    --color-text: #DCD7C9;
}
body, .gradio-container {
    background-color: var(--color-background);
    color: var(--color-text);
}
.gr-button {
    background-color: var(--color-accent);
    color: var(--color-text);
}
.gr-input, .gr-textarea, .gr-dropdown {
    background-color: var(--color-foreground);
    color: var(--color-text);
    border: 1px solid var(--color-accent);
}
.gr-panel {
    background-color: var(--color-foreground);
    border: 1px solid var(--color-accent);
}
.gr-box {
    border-radius: 8px;
    margin-bottom: 10px;
    background-color: var(--color-foreground);
}
.gr-padded {
    padding: 10px;
}
.gr-form {
    background-color: var(--color-foreground);
}
.gr-input-label, .gr-radio-label {
    color: var(--color-text);
}
.gr-checkbox-label {
    color: var(--color-text);
}
.gr-markdown {
    color: var(--color-text);
}
.gr-accordion {
    background-color: var(--color-foreground);
    border: 1px solid var(--color-accent);
}
.gr-accordion-header {
    background-color: var(--color-accent);
    color: var(--color-text);
}
#visualization-container {
    display: flex;
    flex-direction: column;
    border: 2px solid var(--color-accent);
    border-radius: 8px;
    margin-top: 20px;
    padding: 10px;
    background-color: var(--color-foreground);
    height: calc(100vh - 300px); /* Adjust this value as needed */
}
#visualization-plot {
    width: 100%;
    height: 100%;
}
#log-container {
    background-color: var(--color-foreground);
    border: 1px solid var(--color-accent);
    border-radius: 8px;
    padding: 10px;
    margin-top: 20px;
    max-height: none; /* 'auto' is not a valid max-height value */
    overflow-y: auto;
}
.setting-accordion .label-wrap {
    cursor: pointer;
}
.setting-accordion .icon {
    transition: transform 0.3s ease;
}
.setting-accordion[open] .icon {
    transform: rotate(90deg);
}
.gr-form.gr-box {
    border: none !important;
    background: none !important;
}
.model-params {
    border-top: 1px solid var(--color-accent);
    margin-top: 10px;
    padding-top: 10px;
}
"""
def create_interface():
    settings = load_settings()
    llm_api_base = normalize_api_base(settings['api_base'])
    embeddings_api_base = normalize_api_base(settings['embeddings_api_base'])

    with gr.Blocks(theme=gr.themes.Base(), css=css) as demo:
        gr.Markdown("# GraphRAG Indexer")
        with gr.Tabs():
            with gr.TabItem("Indexing"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("## Indexing Configuration")
                        with gr.Row():
                            llm_name = gr.Dropdown(label="LLM Model", choices=[], value=settings['llm_model'], allow_custom_value=True)
                            refresh_llm_btn = gr.Button("🔄", size='sm', scale=0)
                        with gr.Row():
                            embed_name = gr.Dropdown(label="Embedding Model", choices=[], value=settings['embedding_model'], allow_custom_value=True)
                            refresh_embed_btn = gr.Button("🔄", size='sm', scale=0)
                        save_config_button = gr.Button("Save Configuration", variant="primary")
                        config_status = gr.Textbox(label="Configuration Status", lines=2)
                with gr.Row():
                    with gr.Column(scale=1):
                        root_dir = gr.Textbox(label="Root Directory (Edit in .env file)", value=f"{ROOT_DIR}")
                        with gr.Group():
                            verbose = gr.Checkbox(label="Verbose", interactive=True, value=True)
                            nocache = gr.Checkbox(label="No Cache", interactive=True, value=True)
                        with gr.Accordion("Advanced Options", open=True):
                            resume = gr.Textbox(label="Resume Timestamp (optional)")
                            reporter = gr.Dropdown(
                                label="Reporter",
                                choices=["rich", "print", "none"],
                                value="rich",
                                interactive=True
                            )
                            emit_formats = gr.CheckboxGroup(
                                label="Emit Formats",
                                choices=["json", "csv", "parquet"],
                                value=["parquet"],
                                interactive=True
                            )
                            custom_args = gr.Textbox(label="Custom CLI Arguments", placeholder="--arg1 value1 --arg2 value2")
                    with gr.Column(scale=1):
                        gr.Markdown("## Indexing Output")
                        index_output = gr.Textbox(label="Output", lines=10)
                        index_status = gr.Textbox(label="Status", lines=2)
                        run_index_button = gr.Button("Run Indexing", variant="primary")
                        check_status_button = gr.Button("Check Indexing Status")
            with gr.TabItem("Prompt Tuning"):
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("## Prompt Tuning Configuration")
                        pt_root = gr.Textbox(label="Root Directory", value=f"{ROOT_DIR}", interactive=True)
                        pt_domain = gr.Textbox(label="Domain (optional)")
                        pt_method = gr.Dropdown(
                            label="Method",
                            choices=["random", "top", "all"],
                            value="random",
                            interactive=True
                        )
                        pt_limit = gr.Number(label="Limit", value=15, precision=0, interactive=True)
                        pt_language = gr.Textbox(label="Language (optional)")
                        pt_max_tokens = gr.Number(label="Max Tokens", value=2000, precision=0, interactive=True)
                        pt_chunk_size = gr.Number(label="Chunk Size", value=200, precision=0, interactive=True)
                        pt_no_entity_types = gr.Checkbox(label="No Entity Types", value=False)
                        pt_output_dir = gr.Textbox(label="Output Directory", value=f"{ROOT_DIR}/prompts", interactive=True)
                        save_pt_config_button = gr.Button("Save Prompt Tuning Configuration", variant="primary")
                    with gr.Column(scale=1):
                        gr.Markdown("## Prompt Tuning Output")
                        pt_output = gr.Textbox(label="Output", lines=10)
                        pt_status = gr.Textbox(label="Status", lines=10)
                        run_pt_button = gr.Button("Run Prompt Tuning", variant="primary")
                        check_pt_status_button = gr.Button("Check Prompt Tuning Status")
            with gr.TabItem("Data Management"):
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Accordion("File Upload", open=True):
                            file_upload = gr.File(label="Upload File", file_types=[".txt", ".csv", ".parquet"])
                            upload_btn = gr.Button("Upload File", variant="primary")
                            upload_output = gr.Textbox(label="Upload Status", visible=True)
                        with gr.Accordion("File Management", open=True):
                            file_list = gr.Dropdown(label="Select File", choices=[], interactive=True)
                            refresh_btn = gr.Button("Refresh File List", variant="secondary")
                            file_content = gr.TextArea(label="File Content", lines=10)
                            with gr.Row():
                                delete_btn = gr.Button("Delete Selected File", variant="stop")
                                save_btn = gr.Button("Save Changes", variant="primary")
                            operation_status = gr.Textbox(label="Operation Status", visible=True)
                    with gr.Column(scale=1):
                        with gr.Accordion("Output Folders", open=True):
                            output_folder_list = gr.Dropdown(label="Select Output Folder", choices=[], interactive=True)
                            refresh_output_btn = gr.Button("Refresh Output Folders", variant="secondary")
                            folder_content_list = gr.Dropdown(label="Folder Contents", choices=[], interactive=True, multiselect=False)
                            file_info = gr.Textbox(label="File Info", lines=3)
                            output_content = gr.TextArea(label="File Content", lines=10)

        # Event handlers
        def refresh_llm_models():
            models = get_local_models(llm_api_base)
            return gr.update(choices=models)

        def refresh_embed_models():
            models = get_local_models(embeddings_api_base)
            return gr.update(choices=models)

        refresh_llm_btn.click(
            refresh_llm_models,
            outputs=[llm_name]
        )
        refresh_embed_btn.click(
            refresh_embed_models,
            outputs=[embed_name]
        )
        # Initialize model lists on page load
        demo.load(refresh_llm_models, outputs=[llm_name])
        demo.load(refresh_embed_models, outputs=[embed_name])
        def create_indexing_request(llm_model, embed_model, root, verbose_v, nocache_v,
                                    resume_v, reporter_v, emit_v, custom_args_v):
            # Component values arrive as plain Python values via `inputs=`;
            # reading `component.value` would only ever see the initial defaults.
            return IndexingRequest(
                llm_model=llm_model,
                embed_model=embed_model,
                llm_api_base=llm_api_base,
                embed_api_base=embeddings_api_base,
                root=root,
                verbose=verbose_v,
                nocache=nocache_v,
                resume=resume_v if resume_v else None,
                reporter=reporter_v,
                emit=list(emit_v),
                custom_args=custom_args_v if custom_args_v else None
            )

        run_index_button.click(
            lambda *args: start_indexing(create_indexing_request(*args)),
            inputs=[llm_name, embed_name, root_dir, verbose, nocache, resume, reporter, emit_formats, custom_args],
            outputs=[index_output, run_index_button, check_status_button]
        )
        check_status_button.click(
            check_indexing_status,
            outputs=[index_status, index_output]
        )
        def create_prompt_tune_request(root, domain, method, limit, language, max_tokens,
                                       chunk_size, no_entity_types, output_dir):
            return PromptTuneRequest(
                root=root,
                domain=domain if domain else None,
                method=method,
                limit=int(limit),
                language=language if language else None,
                max_tokens=int(max_tokens),
                chunk_size=int(chunk_size),
                no_entity_types=no_entity_types,
                output=output_dir
            )

        def update_pt_output(request):
            result, button_update = start_prompt_tuning(request)
            return result, button_update, gr.update(value=f"Request: {request.dict()}")

        run_pt_button.click(
            lambda *args: update_pt_output(create_prompt_tune_request(*args)),
            inputs=[pt_root, pt_domain, pt_method, pt_limit, pt_language, pt_max_tokens,
                    pt_chunk_size, pt_no_entity_types, pt_output_dir],
            outputs=[pt_output, run_pt_button, pt_status]
        )
        check_pt_status_button.click(
            check_prompt_tuning_status,
            outputs=[pt_status, pt_output]
        )
        # Add event handlers for real-time updates
        pt_root.change(lambda x: gr.update(value=f"Root Directory changed to: {x}"), inputs=[pt_root], outputs=[pt_status])
        pt_limit.change(lambda x: gr.update(value=f"Limit changed to: {x}"), inputs=[pt_limit], outputs=[pt_status])
        pt_max_tokens.change(lambda x: gr.update(value=f"Max Tokens changed to: {x}"), inputs=[pt_max_tokens], outputs=[pt_status])
        pt_chunk_size.change(lambda x: gr.update(value=f"Chunk Size changed to: {x}"), inputs=[pt_chunk_size], outputs=[pt_status])
        pt_output_dir.change(lambda x: gr.update(value=f"Output Directory changed to: {x}"), inputs=[pt_output_dir], outputs=[pt_status])
        # Event handlers for Data Management
        upload_btn.click(
            upload_file,
            inputs=[file_upload],
            outputs=[upload_output, file_list, operation_status]
        )
        refresh_btn.click(
            update_file_list,
            outputs=[file_list]
        )
        refresh_output_btn.click(
            update_output_folder_list,
            outputs=[output_folder_list]
        )
        file_list.change(
            update_file_content,
            inputs=[file_list],
            outputs=[file_content]
        )
        # delete_file/save_file_content also return a log string; slice it off
        # rather than listing operation_status twice in `outputs`
        delete_btn.click(
            lambda file_path: delete_file(file_path)[:2],
            inputs=[file_list],
            outputs=[operation_status, file_list]
        )
        save_btn.click(
            lambda file_path, content: save_file_content(file_path, content)[0],
            inputs=[file_list, file_content],
            outputs=[operation_status]
        )
        output_folder_list.change(
            update_folder_content_list,
            inputs=[output_folder_list],
            outputs=[folder_content_list]
        )
        folder_content_list.change(
            handle_content_selection,
            inputs=[output_folder_list, folder_content_list],
            outputs=[folder_content_list, file_info, output_content]
        )
        # Event handler for saving configuration
        save_config_button.click(
            update_env_file,
            inputs=[llm_name, embed_name],
            outputs=[config_status]
        )
        # Event handler for saving prompt tuning configuration
        save_pt_config_button.click(
            save_prompt_tuning_config,
            inputs=[pt_root, pt_domain, pt_method, pt_limit, pt_language, pt_max_tokens, pt_chunk_size, pt_no_entity_types, pt_output_dir],
            outputs=[pt_status]
        )
        # Initialize file list and output folder list
        demo.load(update_file_list, outputs=[file_list])
        demo.load(update_output_folder_list, outputs=[output_folder_list])

    return demo
def update_env_file(llm_model, embed_model):
    env_path = os.path.join(ROOT_DIR, '.env')
    set_key(env_path, 'LLM_MODEL', llm_model)
    set_key(env_path, 'EMBEDDINGS_MODEL', embed_model)
    # Reload the environment variables
    load_dotenv(env_path, override=True)
    return f"Environment updated: LLM_MODEL={llm_model}, EMBEDDINGS_MODEL={embed_model}"

def save_prompt_tuning_config(root, domain, method, limit, language, max_tokens, chunk_size, no_entity_types, output_dir):
    config = {
        'prompt_tuning': {
            'root': root,
            'domain': domain,
            'method': method,
            'limit': limit,
            'language': language,
            'max_tokens': max_tokens,
            'chunk_size': chunk_size,
            'no_entity_types': no_entity_types,
            'output': output_dir
        }
    }
    config_path = os.path.join(ROOT_DIR, 'prompt_tuning_config.yaml')
    with open(config_path, 'w') as f:
        yaml.dump(config, f)
    return f"Prompt Tuning configuration saved to {config_path}"
demo = create_interface()

if __name__ == "__main__":
    demo.launch(server_port=7861)