joinfv committed on
Commit
0a802e6
·
verified ·
1 Parent(s): 5ca9727

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -0
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import spaces
3
+ import subprocess
4
+ import os
5
+ import shutil
6
+ import string
7
+ import random
8
+ import glob
9
+ from pypdf import PdfReader
10
+ from sentence_transformers import SentenceTransformer
11
+
12
# Model settings, each overridable through an environment variable.
MODEL_NAME = os.getenv("MODEL", "Snowflake/snowflake-arctic-embed-m")
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "128"))
DEFAULT_MAX_CHARACTERS = int(os.getenv("DEFAULT_MAX_CHARACTERS", "258"))

# Load the embedding model once at import time; every function below shares it.
model = SentenceTransformer(MODEL_NAME)
19
+
20
# Embed queries and documents and score every (query, chunk) pair
@spaces.GPU
def embed(queries, chunks):
    """Score every chunk against every query.

    Queries are encoded with the model's "query" prompt, chunks without a
    prompt, and the score is the matrix product of the two embedding sets.

    Returns a dict mapping each query to a list of ``(chunk_index, score)``
    pairs, in chunk order.
    """
    # assumes encode() returns one embedding row per input — TODO confirm
    query_vectors = model.encode(queries, prompt_name="query")
    chunk_vectors = model.encode(chunks)
    score_matrix = query_vectors @ chunk_vectors.T

    positions = range(len(chunks))
    return {
        query: list(zip(positions, row))
        for query, row in zip(queries, score_matrix)
    }
33
+
34
# Extract text from PDF files
def extract_text_from_pdf(reader):
    """Return the text of every non-empty page of *reader* as one string.

    Args:
        reader: An object exposing ``pages``, an iterable of page objects
            that each provide ``extract_text()`` (e.g. a ``PdfReader``).

    Returns:
        The page texts, each preceded by a ``---- Página N ----`` header
        (``N`` is the zero-based page index) and followed by a blank line,
        with leading/trailing whitespace of the whole result stripped.
        Pages that yield no text are skipped; an empty document gives ``""``.
    """
    sections = []
    for idx, page in enumerate(reader.pages):
        # Extract once per page: extraction re-parses the page on every call.
        text = page.extract_text()
        if text:
            sections.append(f"---- Página {idx} ----\n{text}\n\n")

    return "".join(sections).strip()
43
+
44
# Convert a file on disk to plain text
def convert(filename):
    """Return the textual content of the file at *filename*.

    Plain-text formats are read as UTF-8; ``.pdf`` files are parsed with
    ``PdfReader`` and ``extract_text_from_pdf``. The extension check is
    case-insensitive, so ``NOTES.TXT`` is handled like ``notes.txt``.

    Args:
        filename: Path of the file to convert, as a string.

    Returns:
        The file's text.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    plain_text_filetypes = (
        ".txt",
        ".csv",
        ".tsv",
        ".md",
        ".yaml",
        ".toml",
        ".json",
        ".json5",
        ".jsonc",
    )
    lowered = filename.lower()

    if lowered.endswith(plain_text_filetypes):
        # Explicit encoding so the result does not depend on the host locale.
        with open(filename, "r", encoding="utf-8") as f:
            return f.read()

    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(PdfReader(filename))

    raise ValueError(f"Tipo de arquivo não suportado: {filename}")
66
+
67
# Split text into fixed-size pieces
def chunk_to_length(text, max_length=512):
    """Split *text* into consecutive pieces of at most *max_length* characters.

    Args:
        text: The string to split.
        max_length: Maximum size of each piece; must be positive.

    Returns:
        The pieces in order. Every piece except possibly the last has
        exactly ``max_length`` characters. Empty input yields ``[""]`` so
        callers always receive at least one chunk.

    Raises:
        ValueError: If ``max_length`` is not positive (slicing by a
            non-positive step would never consume the text).
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")

    # Index-based slicing avoids re-copying the remaining text on each step.
    pieces = [text[start:start + max_length] for start in range(0, len(text), max_length)]
    return pieces or [text]
75
+
76
# Pick the chunks most relevant to a query
@spaces.GPU
def predict(query, max_characters):
    """Select the highest-scoring chunks that fit within *max_characters*.

    The query is encoded with the model's "query" prompt and scored against
    the stored embeddings of every document in ``docs``. Chunks are taken in
    descending score order; selection stops at the first chunk that would
    push the running character total past ``max_characters``.

    Returns:
        ``{"relevant_chunks": {filename: [chunk, ...]}}`` containing only the
        files that contributed at least one chunk.
    """
    query_vector = model.encode(query, prompt_name="query")

    ranked = [
        (path, piece, score)
        for path, entry in docs.items()
        for piece, score in zip(entry["chunks"], entry["embeddings"] @ query_vector.T)
    ]
    ranked.sort(key=lambda item: item[2], reverse=True)

    selected = {}
    used_chars = 0
    for path, piece, _ in ranked:
        piece_size = len(piece)
        if used_chars + piece_size > max_characters:
            # Budget exhausted: lower-ranked chunks are not considered.
            break
        selected.setdefault(path, []).append(piece)
        used_chars += piece_size

    return {"relevant_chunks": selected}
100
+
101
# Load the documents: every file in src/ is converted to text, chunked and
# embedded once at startup, keyed by its path.
docs = {}
for filename in glob.glob("src/*"):
    # The placeholder file that marks the folder is not a document.
    if not filename.endswith("add_your_files_here"):
        converted_doc = convert(filename)
        chunks = chunk_to_length(converted_doc, CHUNK_SIZE)
        embeddings = model.encode(chunks)
        docs[filename] = dict(chunks=chunks, embeddings=embeddings)
115
+
116
# Build and launch the tool's interface: a query box and a character budget
# in, the selected chunks out.
gr.Interface(
    predict,
    inputs=[
        gr.Textbox(label="Consulta feita sobre os documentos"),
        gr.Number(label="Máximo de caracteres de saída", value=DEFAULT_MAX_CHARACTERS),
    ],
    # predict returns a plain dict; gr.JSON is the Gradio component that
    # renders dict output.
    outputs=[gr.JSON(label="Pedaços relevantes")],
    title="Demonstração do modelo de ferramenta da comunidade ",
    # Single clean sentence: the previous literal embedded stray quote marks,
    # dangling commas and line breaks that were shown verbatim in the UI.
    description=(
        "Para usar no HuggingChat com seus próprios documentos, comece clonando "
        "este espaço, adicione seus documentos à pasta `src` e então crie uma "
        "ferramenta comunitária com este espaço!"
    ),
).launch()