File size: 4,700 Bytes
d60d78b
53602c7
 
 
df45ad8
53602c7
 
 
464ecb2
d60d78b
53602c7
 
df45ad8
d60d78b
 
 
 
 
 
 
 
 
 
dbd3c04
d60d78b
 
53602c7
 
df45ad8
7abc8ff
 
 
 
 
d60d78b
 
df45ad8
d60d78b
 
 
 
53602c7
df45ad8
53602c7
d60d78b
7abc8ff
 
df45ad8
d60d78b
53602c7
d60d78b
 
 
 
53602c7
 
7abc8ff
d60d78b
 
 
 
 
 
 
 
 
 
 
 
 
53602c7
d60d78b
53602c7
 
 
d60d78b
 
 
53602c7
d60d78b
 
53602c7
7abc8ff
d60d78b
7abc8ff
d60d78b
 
 
 
 
7abc8ff
d60d78b
 
53602c7
 
 
d60d78b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os, re, validators, streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_groq import ChatGroq
from langchain.schema import Document
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import PyPDFLoader

# ─────────────────────────  STREAMLIT CONFIG  ──────────────────────────
# Browser-tab metadata first, then the visible page heading.
st.set_page_config(page_icon="🦜", page_title="LangChain Summarizer")
st.title("🦜 LangChain: Summarize YT / Webpage / PDF")

# ────────────────────────────  API KEY INPUT  ──────────────────────────
# Sidebar widgets are created through the ``st.sidebar`` object.
st.sidebar.header("API keys")
groq_api_key = st.sidebar.text_input("Groq API Key", type="password")
if groq_api_key:
    # Export the key so code that reads GROQ_API_KEY from the environment
    # can find it.
    # NOTE(review): os.environ is process-wide; if this app serves several
    # users from one process they would share this value — confirm.
    os.environ["GROQ_API_KEY"] = groq_api_key

# ─────────────────────  PLACEHOLDERS / FILE & URL INPUT  ───────────────
# Two alternative sources: a URL typed by the user, or an uploaded PDF.
generic_url = st.text_input("Paste a YouTube / web URL here:")
uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])

# ──────────────────────────  UTILITY FUNCTIONS  ────────────────────────
def get_video_id(url: str) -> str | None:
    m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
    return m.group(1) if m else None

# Prompt handed to the summarization chain; ``{text}`` is its only input
# variable.
SUMMARY_PROMPT = PromptTemplate(
    input_variables=["text"],
    template=(
        "Provide a concise summary (~300 words):\n\n"
        "Content:\n{text}"
    ),
)

def build_llm() -> ChatGroq:
    """Return a ChatGroq client cached in ``st.session_state``.

    The client is built with the key read from the ``GROQ_API_KEY``
    environment variable and stored as ``st.session_state.llm``. The key it
    was built with is stored next to it (``llm_api_key``) so that a changed
    key yields a fresh client instead of silently reusing the one bound to
    the previous key.

    Returns:
        The cached (or newly created) ``ChatGroq`` instance.

    Raises:
        KeyError: If ``GROQ_API_KEY`` is not present in the environment.
    """
    api_key = os.environ["GROQ_API_KEY"]
    state = st.session_state
    # Rebuild when there is no client yet or the key has changed since the
    # cached client was created.
    if "llm" not in state or state.get("llm_api_key") != api_key:
        state.llm = ChatGroq(
            model="llama3-70b-8192",
            groq_api_key=api_key,
        )
        state.llm_api_key = api_key
    return state.llm

def summarize(docs):
    """Summarize *docs* and return the summary text.

    Builds a ``"stuff"`` summarize chain around the LLM from ``build_llm()``
    with ``SUMMARY_PROMPT`` and runs it on the documents.

    Args:
        docs: Documents passed to the chain as ``input_documents``.
            NOTE(review): presumably a list of LangChain ``Document``
            objects, as produced by the loaders in this file — confirm.

    Returns:
        The chain's ``output_text`` value.
    """
    chain = load_summarize_chain(
        build_llm(),
        chain_type="stuff",
        prompt=SUMMARY_PROMPT,
    )
    # ``invoke`` replaces calling the chain object directly, which LangChain
    # has deprecated.
    result = chain.invoke({"input_documents": docs})
    return result["output_text"]

# ─────────────────────────────  MAIN ACTION  ───────────────────────────
# Runs once per click on "Summarize": validates the inputs, loads the content
# from exactly one source (an uploaded PDF wins over a URL), and shows the
# summary or an error message.
if st.button("Summarize"):
    # Tolerate stray whitespace pasted around the URL.
    target_url = generic_url.strip()

    if not groq_api_key:
        st.error("Please enter your Groq API key in the sidebar.")
    elif not target_url and not uploaded_file:
        st.error("Provide a URL or upload a PDF, then press Summarize.")
    else:
        try:
            with st.spinner("Fetching and summarizing…"):

                # ---------- PDF ----------
                if uploaded_file:
                    # Local import keeps this block self-contained.
                    import tempfile

                    # PyPDFLoader is given a filesystem path, so the upload
                    # is spooled to disk. A generated temp name is used
                    # instead of the client-supplied file name, which could
                    # contain path separators or collide with another upload.
                    with tempfile.NamedTemporaryFile(
                        suffix=".pdf", delete=False
                    ) as tmp:
                        # getvalue() returns the whole buffer regardless of
                        # the current stream position, unlike read().
                        tmp.write(uploaded_file.getvalue())
                        tmp_path = tmp.name
                    try:
                        docs = PyPDFLoader(tmp_path).load()
                    finally:
                        # Remove the temp file even when loading fails.
                        os.remove(tmp_path)
                    st.success(summarize(docs))

                # ---------- YouTube ----------
                elif "youtube" in target_url or "youtu.be" in target_url:
                    vid = get_video_id(target_url)
                    if not vid:
                        st.error("Couldn’t extract a YouTube video ID 🤔")
                    else:
                        transcript = YouTubeTranscriptApi.get_transcript(vid)
                        # Join the transcript snippets into one text blob.
                        text = " ".join(t["text"] for t in transcript)
                        st.success(summarize([Document(page_content=text)]))

                # ---------- Plain Webpage ----------
                else:
                    if not validators.url(target_url):
                        st.error("That doesn’t look like a valid URL.")
                    else:
                        # NOTE(review): ssl_verify=False turns off TLS
                        # certificate verification for the fetch; kept as in
                        # the original — confirm this is intentional.
                        docs = UnstructuredURLLoader(
                            urls=[target_url],
                            ssl_verify=False,
                            headers={
                                "User-Agent":
                                "Mozilla/5.0 (X11; Linux) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Chrome/121.0 Safari/537.36"
                            },
                        ).load()
                        st.success(summarize(docs))

        except (TranscriptsDisabled, VideoUnavailable) as yt_err:
            # Expected YouTube failures: show only the message.
            st.error(str(yt_err))
        except Exception as e:
            # Top-level boundary of the app: surface anything unexpected
            # to the user instead of crashing the script run.
            st.exception(e)