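# preguntaDOC: a Streamlit app that answers questions about an uploaded PDF
# via Hugging Face Hub models through LangChain.
#
# The imports below assume the legacy (pre-0.1) `langchain` package layout.
# Assumed dependencies (a sketch; exact pins untested): streamlit, PyPDF2,
# langchain, faiss-cpu, huggingface_hub.
# Run with `streamlit run app.py` (the filename is an assumption).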
import streamlit as st
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate

st.set_page_config(page_title='preguntaDOC')
st.header("Ask your PDF")

# Field for the Hugging Face token (now required for the embeddings)
huggingface_api_token = st.text_input('Hugging Face API Token (required)', type='password')

pdf_obj = st.file_uploader("Upload your document", type="pdf", on_change=st.cache_resource.clear)

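# Cache the index so reruns with the same PDF skip re-embedding; the cache is
# cleared whenever a new file is uploaded (on_change=st.cache_resource.clear above).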
@st.cache_resource
def create_embeddings(pdf, api_token):
    if not api_token:
        st.error("A Hugging Face API token is required")
        return None
        
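    # Export the token so Hub clients that read the environment variable also pick it up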
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
    
    pdf_reader = PdfReader(pdf)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages without a text layer
        text += page.extract_text() or ""
    
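    # ~800-character chunks with 100 characters of overlap, so answers that
    # span a chunk boundary still keep their surrounding context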
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    
    # Use HuggingFaceHubEmbeddings instead of HuggingFaceEmbeddings:
    # this approach does not require sentence-transformers installed locally
    embeddings = HuggingFaceHubEmbeddings(
        repo_id="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        huggingfacehub_api_token=api_token
    )
    
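    # Build an in-memory FAISS index over the embedded chunks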
    knowledge_base = FAISS.from_texts(chunks, embeddings)
    return knowledge_base

if pdf_obj and huggingface_api_token:
    knowledge_base = create_embeddings(pdf_obj, huggingface_api_token)
    
    if knowledge_base:
        user_question = st.text_input("Ask a question about your PDF:")
        
        if user_question:
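            # Retrieve the chunks most similar to the question to use as context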
            docs = knowledge_base.similarity_search(user_question, k=3)
            
            # Use a free Hugging Face model
            llm = HuggingFaceHub(
                repo_id="google/flan-t5-large",
                huggingfacehub_api_token=huggingface_api_token,
                model_kwargs={"temperature": 0.5, "max_length": 512}
            )
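            # temperature=0.5 allows mild sampling variety; max_length caps the
            # generated output (values as shipped in the app, not tuned)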
            
            prompt_template = """
            Answer the following question using only the provided context.
            
            Context: {context}
            
            Question: {question}
            
            Answer:
            """
            
            PROMPT = PromptTemplate(
                template=prompt_template, 
                input_variables=["context", "question"]
            )
            
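            # "stuff" chain type: all retrieved chunks are stuffed into one prompt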
            chain = load_qa_chain(llm, chain_type="stuff", prompt=PROMPT)
            
            with st.spinner("Processing your question..."):
                try:
                    respuesta = chain.run(input_documents=docs, question=user_question)
                    st.write(respuesta)
                except Exception as e:
                    st.error(f"Error processing your question: {str(e)}")
elif not huggingface_api_token and pdf_obj:
    st.warning("Please enter your Hugging Face API token to continue.")