Spaces:
Sleeping
Sleeping
File size: 4,700 Bytes
d60d78b 53602c7 df45ad8 53602c7 464ecb2 d60d78b 53602c7 df45ad8 d60d78b dbd3c04 d60d78b 53602c7 df45ad8 7abc8ff d60d78b df45ad8 d60d78b 53602c7 df45ad8 53602c7 d60d78b 7abc8ff df45ad8 d60d78b 53602c7 d60d78b 53602c7 7abc8ff d60d78b 53602c7 d60d78b 53602c7 d60d78b 53602c7 d60d78b 53602c7 7abc8ff d60d78b 7abc8ff d60d78b 7abc8ff d60d78b 53602c7 d60d78b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import os, re, validators, streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain_groq import ChatGroq
from langchain.schema import Document
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.document_loaders import PyPDFLoader
# ───────────────────────── STREAMLIT CONFIG ──────────────────────────
# Page metadata first, then the title rendered at the top of the app.
# The icon/title emoji were mis-encoded in this file; restored to the parrot
# glyph (NOTE(review): confirm this is the intended icon).
st.set_page_config(page_title="LangChain Summarizer", page_icon="🦜")
st.title("🦜 LangChain: Summarize YT / Webpage / PDF")

# ──────────────────────────── API KEY INPUT ──────────────────────────
with st.sidebar:
    st.header("API keys")
    # strip() so whitespace picked up while pasting never becomes part of the key.
    groq_api_key = st.text_input("Groq API Key", type="password").strip()
    if groq_api_key:
        # NOTE(review): os.environ is process-wide; if several users share one
        # server process, the most recently entered key wins — confirm that is
        # acceptable for this deployment.
        os.environ["GROQ_API_KEY"] = groq_api_key  # for libraries

# ───────────────────── PLACEHOLDERS / FILE & URL INPUT ───────────────
# Two alternative inputs: a URL (YouTube or any web page) or an uploaded PDF.
generic_url = st.text_input("Paste a YouTube / web URL here:")
uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])
# ────────────────────────── UTILITY FUNCTIONS ────────────────────────
def get_video_id(url: str) -> str | None:
m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
return m.group(1) if m else None
# Prompt text for the summarization chain; "{text}" is its only input variable.
_SUMMARY_TEMPLATE = "Provide a concise summary (~300 words):\n\nContent:\n{text}"

SUMMARY_PROMPT = PromptTemplate(
    input_variables=["text"],
    template=_SUMMARY_TEMPLATE,
)
def build_llm() -> ChatGroq:
    """Return a ChatGroq client cached in ``st.session_state``.

    The client is rebuilt whenever the ``GROQ_API_KEY`` environment variable
    differs from the key the cached client was created with. Without that
    check, a session that once used a wrong key would keep using it even after
    the key is corrected.

    Returns:
        The cached (or freshly created) ``ChatGroq`` instance.

    Raises:
        KeyError: If ``GROQ_API_KEY`` is not set in the environment.
    """
    api_key = os.environ["GROQ_API_KEY"]
    state = st.session_state
    if "llm" not in state or state.get("llm_api_key") != api_key:
        state.llm = ChatGroq(
            model="llama3-70b-8192",
            groq_api_key=api_key,
        )
        # Remember which key built the client so a changed key is detected.
        state.llm_api_key = api_key
    return state.llm
def summarize(docs):
    """Summarize *docs* with a "stuff" chain and return the summary text.

    Args:
        docs: A list of document objects exposing ``page_content``.

    Returns:
        The ``"output_text"`` entry of the chain result.

    Raises:
        ValueError: If *docs* is empty or contains only blank text, e.g. a
            scanned PDF with no extractable text. Failing here avoids sending
            an empty prompt to the model.
    """
    if not any((doc.page_content or "").strip() for doc in docs):
        raise ValueError("No text could be extracted from the source to summarize.")
    llm = build_llm()
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=SUMMARY_PROMPT)
    # invoke() replaces the deprecated direct call on the chain object.
    return chain.invoke({"input_documents": docs})["output_text"]
# ───────────────────────────── MAIN ACTION ───────────────────────────
def _load_pdf_docs(uploaded_pdf):
    """Write the uploaded PDF to a private temp file and return its loaded pages.

    The temp path comes from ``tempfile`` rather than the client-supplied file
    name, so a crafted name cannot escape the temp directory or overwrite
    another upload. The file is removed even when loading fails.
    """
    import tempfile  # local import: only this helper needs it

    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        # getvalue() returns the whole upload regardless of stream position.
        tmp.write(uploaded_pdf.getvalue())
        tmp_path = tmp.name
    try:
        return PyPDFLoader(tmp_path).load()
    finally:
        os.remove(tmp_path)


if st.button("Summarize"):
    # Ignore whitespace picked up while pasting the URL.
    url = generic_url.strip()
    if not groq_api_key:
        st.error("Please enter your Groq API key in the sidebar.")
    elif not url and not uploaded_file:
        st.error("Provide a URL or upload a PDF, then press Summarize.")
    else:
        try:
            with st.spinner("Fetching and summarizing…"):
                # ---------- PDF ----------
                # An uploaded file takes precedence over any URL that was typed.
                if uploaded_file:
                    st.success(summarize(_load_pdf_docs(uploaded_file)))
                # ---------- YouTube ----------
                elif "youtube" in url or "youtu.be" in url:
                    vid = get_video_id(url)
                    if not vid:
                        st.error("Couldn’t extract a YouTube video ID 🤔")
                    else:
                        transcript = YouTubeTranscriptApi.get_transcript(vid)
                        # Join the caption snippets into one document.
                        text = " ".join(t["text"] for t in transcript)
                        st.success(summarize([Document(page_content=text)]))
                # ---------- Plain Webpage ----------
                else:
                    if not validators.url(url):
                        st.error("That doesn’t look like a valid URL.")
                    else:
                        # NOTE(review): ssl_verify=False turns off TLS certificate
                        # validation for the fetch; kept as in the original, but
                        # confirm this is intentional.
                        docs = UnstructuredURLLoader(
                            urls=[url],
                            ssl_verify=False,
                            headers={
                                "User-Agent":
                                    "Mozilla/5.0 (X11; Linux) AppleWebKit/537.36 "
                                    "(KHTML, like Gecko) Chrome/121.0 Safari/537.36"
                            },
                        ).load()
                        st.success(summarize(docs))
        except (TranscriptsDisabled, VideoUnavailable) as yt_err:
            # Expected YouTube failures: show only the message, no traceback.
            st.error(str(yt_err))
        except Exception as e:
            # Top-level boundary of the app: surface anything else with its traceback.
            st.exception(e)
|