"""Streamlit app that summarizes a YouTube video, a web page, or a PDF.

The user supplies a Groq API key in the sidebar plus either a URL or an
uploaded PDF.  The content is loaded into LangChain ``Document`` objects and
summarized with a single-prompt ("stuff") summarize chain backed by ChatGroq.
"""

import os
import re
import tempfile
from urllib.parse import parse_qs, urlparse

import streamlit as st
import validators
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_groq import ChatGroq
from youtube_transcript_api import (
    TranscriptsDisabled,
    VideoUnavailable,
    YouTubeTranscriptApi,
)

# ───────────────────────── STREAMLIT CONFIG ──────────────────────────
st.set_page_config(page_title="LangChain Summarizer", page_icon="🦜")
st.title("🦜 LangChain: Summarize YT / Webpage / PDF")

# ──────────────────────────── API KEY INPUT ──────────────────────────
with st.sidebar:
    st.header("API keys")
    groq_api_key = st.text_input("Groq API Key", type="password")
    if groq_api_key:
        # NOTE(review): os.environ is process-wide, so on a shared deployment
        # this key is visible to every session served by the same process.
        os.environ["GROQ_API_KEY"] = groq_api_key  # for libraries

# ───────────────────── PLACEHOLDERS / FILE & URL INPUT ───────────────
# Strip surrounding whitespace so a pasted URL with a trailing space or
# newline still validates.
generic_url = st.text_input("Paste a YouTube / web URL here:").strip()
uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])

# ────────────────────────── UTILITY FUNCTIONS ────────────────────────
_YOUTUBE_DOMAINS = ("youtube.com", "youtu.be", "youtube-nocookie.com")
# Path prefixes whose following segment is the video ID, e.g. /embed/<id>.
_ID_PATH_PREFIXES = ("embed", "shorts", "live", "v")
_VIDEO_ID_RE = re.compile(r"[0-9A-Za-z_-]{11}")


def _parse_url(url: str):
    """Parse *url*, tolerating a missing scheme; return ``None`` if malformed."""
    candidate = url.strip()
    if "://" not in candidate:
        # Without a scheme urlparse puts the host into the path component.
        candidate = f"https://{candidate}"
    try:
        return urlparse(candidate)
    except ValueError:
        return None


def _is_youtube_url(url: str) -> bool:
    """Return True when the *hostname* of *url* belongs to YouTube.

    Checking the hostname (rather than searching the whole URL for the text
    "youtube") keeps ordinary pages such as ``example.com/youtube-tips`` on
    the web-page code path.
    """
    parts = _parse_url(url)
    host = (parts.hostname or "").lower() if parts else ""
    return any(
        host == domain or host.endswith(f".{domain}")
        for domain in _YOUTUBE_DOMAINS
    )


def get_video_id(url: str) -> str | None:
    """Extract the 11-character YouTube video ID from *url*.

    Recognized shapes: ``youtu.be/<id>``, ``…/watch?v=<id>`` and
    ``…/embed|shorts|live|v/<id>``.  Extra query parameters are ignored.

    Returns:
        The video ID, or ``None`` when the URL does not carry a well-formed
        one (for example a channel or playlist link).
    """
    parts = _parse_url(url)
    if parts is None:
        return None

    host = (parts.hostname or "").lower()
    segments = [segment for segment in parts.path.split("/") if segment]

    if host == "youtu.be" or host.endswith(".youtu.be"):
        token = segments[0] if segments else ""
    else:
        token = parse_qs(parts.query).get("v", [""])[0]
        if not token and len(segments) >= 2 and segments[0] in _ID_PATH_PREFIXES:
            token = segments[1]

    # Require the whole token to be an ID so prefixes of longer path
    # segments (channel IDs, slugs, …) are not mistaken for a video.
    return token if _VIDEO_ID_RE.fullmatch(token) else None


SUMMARY_PROMPT = PromptTemplate(
    template="Provide a concise summary (~300 words):\n\nContent:\n{text}",
    input_variables=["text"],
)


def build_llm() -> ChatGroq:
    """Return the ChatGroq client cached in ``st.session_state``.

    The client is rebuilt whenever the API key in the environment differs
    from the one it was created with, so correcting a mistyped key in the
    sidebar takes effect without restarting the session.

    Raises:
        KeyError: if ``GROQ_API_KEY`` is not set in the environment.
    """
    api_key = os.environ["GROQ_API_KEY"]
    stale = st.session_state.get("llm_api_key") != api_key
    if stale or "llm" not in st.session_state:
        st.session_state.llm = ChatGroq(
            model="llama3-70b-8192",
            groq_api_key=api_key,
        )
        st.session_state.llm_api_key = api_key
    return st.session_state.llm


def summarize(docs):
    """Summarize *docs* (a list of ``Document``) and return the summary text.

    Uses the "stuff" chain type, i.e. all documents go into one prompt.
    """
    llm = build_llm()
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=SUMMARY_PROMPT)
    # ``invoke`` replaces the deprecated ``chain(...)`` call style.
    return chain.invoke({"input_documents": docs})["output_text"]


def _load_pdf(pdf_file) -> list[Document]:
    """Load an uploaded PDF into documents via a private temporary file.

    The temp file name is generated by ``tempfile`` instead of being derived
    from the client-supplied file name, and it is removed even when parsing
    fails.
    """
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        # getvalue() returns the full payload regardless of stream position.
        tmp.write(pdf_file.getvalue())
        tmp_path = tmp.name
    # The file is closed before the loader opens it by path.
    try:
        return PyPDFLoader(tmp_path).load()
    finally:
        os.remove(tmp_path)


def _load_documents(url: str, pdf_file) -> list[Document] | None:
    """Turn the user's input into documents, preferring the PDF if present.

    Returns ``None`` after showing an error when the input is unusable.
    Loader and transcript exceptions propagate to the caller.
    """
    # ---------- PDF ----------
    if pdf_file:
        return _load_pdf(pdf_file)

    # ---------- YouTube ----------
    if _is_youtube_url(url):
        vid = get_video_id(url)
        if not vid:
            st.error("Couldn’t extract a YouTube video ID 🤔")
            return None
        transcript = YouTubeTranscriptApi.get_transcript(vid)
        text = " ".join(t["text"] for t in transcript)
        return [Document(page_content=text)]

    # ---------- Plain Webpage ----------
    if not validators.url(url):
        st.error("That doesn’t look like a valid URL.")
        return None
    return UnstructuredURLLoader(
        urls=[url],
        # NOTE(review): TLS certificate verification is disabled here, which
        # permits man-in-the-middle tampering; confirm this is intentional.
        ssl_verify=False,
        headers={
            "User-Agent": "Mozilla/5.0 (X11; Linux) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/121.0 Safari/537.36"
        },
    ).load()


# ───────────────────────────── MAIN ACTION ───────────────────────────
if st.button("Summarize"):
    if not groq_api_key:
        st.error("Please enter your Groq API key in the sidebar.")
    elif not generic_url and not uploaded_file:
        st.error("Provide a URL or upload a PDF, then press Summarize.")
    else:
        try:
            with st.spinner("Fetching and summarizing…"):
                docs = _load_documents(generic_url, uploaded_file)
                if docs is None:
                    pass  # a specific error has already been displayed
                elif not any(doc.page_content.strip() for doc in docs):
                    # Avoid asking the model to summarize nothing.
                    st.error("No text could be extracted from that source.")
                else:
                    st.success(summarize(docs))
        except (TranscriptsDisabled, VideoUnavailable) as yt_err:
            st.error(str(yt_err))
        except Exception as e:  # top-level UI boundary: show the traceback
            st.exception(e)