import gradio as gr import spaces import subprocess import os import shutil import string import random import glob from pypdf import PdfReader from sentence_transformers import SentenceTransformer # Configurações do modelo MODEL_NAME = os.environ.get("MODEL", "Snowflake/snowflake-arctic-embed-m") CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", 128)) DEFAULT_MAX_CHARACTERS = int(os.environ.get("DEFAULT_MAX_CHARACTERS", 258)) # Carregue o modelo de linguagem model = SentenceTransformer(MODEL_NAME) # Função para incorporar consultas e documentos @spaces.GPU def embed(queries, chunks): query_embeddings = model.encode(queries, prompt_name="query") document_embeddings = model.encode(chunks) scores = query_embeddings @ document_embeddings.T results = {} for query, query_scores in zip(queries, scores): chunk_idxs = [i for i in range(len(chunks))] results[query] = list(zip(chunk_idxs, query_scores)) return results # Função para extrair texto de arquivos PDF def extract_text_from_pdf(reader): full_text = "" for idx, page in enumerate(reader.pages): text = page.extract_text() if len(text) > 0: full_text += f"---- Página {idx} ----\n" + page.extract_text() + "\n\n" return full_text.strip() # Função para converter arquivos em texto def convert(filename): plain_text_filetypes = [ ".txt", ".csv", ".tsv", ".md", ".yaml", ".toml", ".json", ".json5", ".jsonc", ] if any(filename.endswith(ft) for ft in plain_text_filetypes): with open(filename, "r") as f: return f.read() if filename.endswith(".pdf"): return extract_text_from_pdf(PdfReader(filename)) raise ValueError(f"Tipo de arquivo não suportado: {filename}") # Função para dividir texto em pedaços def chunk_to_length(text, max_length=512): chunks = [] while len(text) > max_length: chunks.append(text[:max_length]) text = text[max_length:] chunks.append(text) return chunks # Função para prever pedaços relevantes @spaces.GPU def predict(query, max_characters): query_embedding = model.encode(query, prompt_name="query") all_chunks = [] for filename, doc in docs.items(): similarities = doc["embeddings"] @ query_embedding.T all_chunks.extend([(filename, chunk, sim) for chunk, sim in zip(doc["chunks"], similarities)]) all_chunks.sort(key=lambda x: x[2], reverse=True) relevant_chunks = {} total_chars = 0 for filename, chunk, _ in all_chunks: if total_chars + len(chunk) <= max_characters: if filename not in relevant_chunks: relevant_chunks[filename] = [] relevant_chunks[filename].append(chunk) total_chars += len(chunk) else: break return {"relevant_chunks": relevant_chunks} # Carregue os documentos docs = {} for filename in glob.glob("src/*"): if filename.endswith("add_your_files_here"): continue converted_doc = convert(filename) chunks = chunk_to_length(converted_doc, CHUNK_SIZE) embeddings = model.encode(chunks) docs[filename] = { "chunks": chunks, "embeddings": embeddings, } # Crie a interface da ferramenta gr.Interface( predict, inputs=[ gr.Textbox(label="Consulta feita sobre os documentos"), gr.Number(label="Máximo de caracteres de saída", value=DEFAULT_MAX_CHARACTERS), ], outputs=[gr.Dict(label="Pedaços relevantes")], title="Demonstração do modelo de ferramenta da comunidade ", description='''"Para usar o no HuggingChat com seus próprios documentos , comece clonando este espaço, adicione seus documentos à pasta `src` e então crie uma ferramenta comunitária com este espaço!" ,''' ).launch()