# Hugging Face Spaces page chrome captured with this file
# ("Spaces: Sleeping / Sleeping"); not part of the program.
import os
import re
import tempfile

import streamlit as st
import validators
from langchain.chains.summarize import load_summarize_chain
from langchain.document_loaders import PyPDFLoader
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_groq import ChatGroq
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
# ───────────────────────── STREAMLIT CONFIG ──────────────────────────
st.set_page_config(page_title="LangChain Summarizer", page_icon="🦜")
st.title("🦜 LangChain: Summarize YT / Webpage / PDF")

# ──────────────────────────── API KEY INPUT ──────────────────────────
with st.sidebar:
    st.header("API keys")
    # Keys are usually pasted; drop accidental surrounding whitespace so the
    # value is usable as-is.
    groq_api_key = st.text_input("Groq API Key", type="password").strip()

if groq_api_key:
    # Exported for code that reads the key from the environment.
    # NOTE(review): os.environ is process-wide, so on a shared deployment the
    # most recently entered key is what every reader of this variable sees —
    # confirm that is acceptable for how this app is hosted.
    os.environ["GROQ_API_KEY"] = groq_api_key  # for libraries

# ───────────────────── PLACEHOLDERS / FILE & URL INPUT ───────────────
generic_url = st.text_input("Paste a YouTube / web URL here:")
uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])
# ────────────────────────── UTILITY FUNCTIONS ────────────────────────
def get_video_id(url: str) -> str | None: | |
m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url) | |
return m.group(1) if m else None | |
# Prompt passed to the summarize chain; `text` is its only input variable.
_SUMMARY_TEMPLATE = "Provide a concise summary (~300 words):\n\nContent:\n{text}"

SUMMARY_PROMPT = PromptTemplate(
    input_variables=["text"],
    template=_SUMMARY_TEMPLATE,
)
def build_llm() -> ChatGroq:
    """Return a ChatGroq client cached in ``st.session_state``.

    The client is built from the ``GROQ_API_KEY`` environment variable and
    reused on later calls.  The key it was built with is remembered next to
    it, so that a changed key produces a fresh client instead of silently
    reusing one bound to the old (possibly wrong) key.

    Raises:
        KeyError: if ``GROQ_API_KEY`` is not set in the environment.
    """
    api_key = os.environ["GROQ_API_KEY"]
    state = st.session_state

    # Rebuild when nothing is cached yet or the key has changed since.
    if "llm" not in state or state.get("llm_api_key") != api_key:
        state.llm = ChatGroq(
            model="llama3-70b-8192",
            groq_api_key=api_key,
        )
        state.llm_api_key = api_key
    return state.llm
def summarize(docs):
    """Summarize *docs* with the "stuff" chain and return the summary text.

    Args:
        docs: iterable of document objects exposing ``page_content``.

    Returns:
        The ``"output_text"`` entry of the chain result.

    Raises:
        ValueError: if *docs* is empty or contains only blank content.
            Checked before any LLM call, so an extraction that produced
            nothing is reported as such rather than "summarized".
    """
    if not any(doc.page_content.strip() for doc in (docs or ())):
        raise ValueError(
            "No text could be extracted from the source, so there is nothing to summarize."
        )

    chain = load_summarize_chain(build_llm(), chain_type="stuff", prompt=SUMMARY_PROMPT)
    # `invoke` is the supported entry point; calling the chain object
    # directly is deprecated in current LangChain releases.
    result = chain.invoke({"input_documents": docs})
    return result["output_text"]
# ───────────────────────────── MAIN ACTION ───────────────────────────
# On "Summarize": validate the inputs, then summarize exactly one source,
# in priority order: uploaded PDF, YouTube URL, generic web page.
if st.button("Summarize"):
    # Pasted URLs often carry stray whitespace.
    url = (generic_url or "").strip()

    if not groq_api_key:
        st.error("Please enter your Groq API key in the sidebar.")
    elif not url and not uploaded_file:
        st.error("Provide a URL or upload a PDF, then press Summarize.")
    else:
        try:
            with st.spinner("Fetching and summarizing…"):
                # ---------- PDF ----------
                if uploaded_file:
                    # PyPDFLoader is given a filesystem path, so spool the
                    # upload to disk.  The name is generated by tempfile and
                    # never derived from the user-supplied file name, which
                    # could contain path separators or clash with another
                    # upload.
                    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                        # getvalue() returns the full payload regardless of
                        # the current read position of the upload.
                        tmp.write(uploaded_file.getvalue())
                        tmp_path = tmp.name
                    try:
                        docs = PyPDFLoader(tmp_path).load()
                    finally:
                        # Remove the temp file even when loading fails.
                        os.remove(tmp_path)
                    st.success(summarize(docs))

                # ---------- YouTube ----------
                elif "youtube" in url or "youtu.be" in url:
                    vid = get_video_id(url)
                    if not vid:
                        st.error("Couldn’t extract a YouTube video ID 🤔")
                    else:
                        # NOTE(review): `get_transcript` is the classic
                        # static entry point of youtube-transcript-api —
                        # confirm it exists in the pinned version.
                        transcript = YouTubeTranscriptApi.get_transcript(vid)
                        text = " ".join(t["text"] for t in transcript)
                        st.success(summarize([Document(page_content=text)]))

                # ---------- Plain Webpage ----------
                else:
                    if not validators.url(url):
                        st.error("That doesn’t look like a valid URL.")
                    else:
                        docs = UnstructuredURLLoader(
                            urls=[url],
                            # NOTE(security): TLS certificate verification is
                            # disabled for the fetch.  Kept as in the original
                            # so sites with broken certificates still load —
                            # review whether that trade-off is intended.
                            ssl_verify=False,
                            headers={
                                "User-Agent":
                                    "Mozilla/5.0 (X11; Linux) AppleWebKit/537.36 "
                                    "(KHTML, like Gecko) Chrome/121.0 Safari/537.36"
                            },
                        ).load()
                        st.success(summarize(docs))

        except (TranscriptsDisabled, VideoUnavailable) as yt_err:
            # Expected YouTube failures: show the message without a traceback.
            st.error(str(yt_err))
        except Exception as e:
            # Top-level boundary of the app: surface anything else in the UI.
            st.exception(e)