Shreyas94 committed
Commit 3f6ed4f · verified · 1 Parent(s): e62d662

Update app.py

Files changed (1)
  1. app.py +44 -63
app.py CHANGED
@@ -1,71 +1,52 @@
-from googlesearch import search
+import gradio as gr
+from transformers import pipeline
 from bs4 import BeautifulSoup
 import requests
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-# Initialize Phi-3-vision-128k-instruct tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-vision-128k-instruct')
-model = AutoModelForSeq2SeqLM.from_pretrained('microsoft/Phi-3-vision-128k-instruct', trust_remote_code=True)
-
-# Function to perform Google search and retrieve URLs, filtering by domain
-def google_search(query: str, num_results: int = 10):
-    """Perform a Google search and retrieve the URLs of the search results."""
-    search_results = []
-    try:
-        for url in search(query, num_results=num_results, domains=["tesla.com", "cnbc.com", "reuters.com", "bloomberg.com", "investopedia.com"]):
-            search_results.append(url)
-    except TypeError:
-        for url in search(query, num_results=num_results):
-            if any(domain in url for domain in ["tesla.com", "cnbc.com", "reuters.com", "bloomberg.com", "investopedia.com"]):
-                search_results.append(url)
-    return search_results
-
-# Function to fetch content from a URL and summarize it
-def fetch_and_summarize_url(url: str):
-    try:
-        response = requests.get(url)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-
-        # Extract relevant content (e.g., paragraphs or sections)
-        paragraphs = [p.text for p in soup.find_all('p')]
-        combined_text = " ".join(paragraphs[:3])  # Combine first few paragraphs for summary
-
-        # Tokenize the text
-        inputs = tokenizer.encode("summarize: " + combined_text, return_tensors="pt", max_length=1024, truncation=True)
-
-        # Generate summary
-        summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
-
-        return summary
-    except requests.RequestException as e:
-        return None
-
-# Function to perform Google search and aggregate summaries
-def google_search_and_answer(question: str, keywords: str):
-    search_query = f"{question} {keywords}"
-    search_results = google_search(search_query)
-    summaries = []
-    for url in search_results:
-        fetched_summary = fetch_and_summarize_url(url)
-        if fetched_summary:
-            # Add additional logic to filter summaries based on relevance
-            # Example: Check if either question or keywords are present in fetched_summary
-            if question.lower() in fetched_summary.lower() or keywords.lower() in fetched_summary.lower():
-                summaries.append(fetched_summary)
-    if summaries:
-        return "\n\n".join(summaries)
-    else:
-        return "No relevant information found."
-
-# Main function to run the script
-def main():
-    print("Intelligent Assistant")
-    question = input("Enter your query: ")
-    keywords = input("Enter specific keywords (e.g., 'Q1 2024 financial results Tesla'): ")
-    answer = google_search_and_answer(question, keywords)
-    print("Answer:", answer)
-
-if __name__ == "__main__":
-    main()
+
+def summarize_blog_post(url):
+    # Load summarization pipeline
+    summarizer = pipeline("summarization")
+
+    # Get blog post content
+    r = requests.get(url)
+    soup = BeautifulSoup(r.text, 'html.parser')
+    results = soup.find_all(['h1', 'p'])
+    text = [result.text for result in results]
+    ARTICLE = ' '.join(text)
+
+    # Chunk text
+    max_chunk = 500
+    ARTICLE = ARTICLE.replace('.', '.<eos>')
+    ARTICLE = ARTICLE.replace('?', '?<eos>')
+    ARTICLE = ARTICLE.replace('!', '!<eos>')
+
+    sentences = ARTICLE.split('<eos>')
+    current_chunk = 0
+    chunks = []
+    for sentence in sentences:
+        if len(chunks) == current_chunk + 1:
+            if len(chunks[current_chunk]) + len(sentence.split(' ')) <= max_chunk:
+                chunks[current_chunk].extend(sentence.split(' '))
+            else:
+                current_chunk += 1
+                chunks.append(sentence.split(' '))
+        else:
+            chunks.append(sentence.split(' '))
+
+    for chunk_id in range(len(chunks)):
+        chunks[chunk_id] = ' '.join(chunks[chunk_id])
+
+    # Summarize text
+    summaries = summarizer(chunks, max_length=120, min_length=30, do_sample=False)
+    summary_text = " ".join([summary['summary_text'] for summary in summaries])
+    return summary_text
+
+iface = gr.Interface(
+    fn=summarize_blog_post,
+    inputs="text",
+    outputs="text",
+    title="Medium Blog Post Summarizer",
+    description="Enter the URL of a Medium blog post to get a summarized version of the content."
+)
+
+iface.launch()
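
The commit replaces the Google-search-and-Phi-3 pipeline with a single-URL Gradio summarizer. Two notes on why this is a sounder setup: Phi-3-vision-128k-instruct is a causal vision-language model, not a seq2seq model, so the old `AutoModelForSeq2SeqLM` load was unlikely to work, whereas `pipeline("summarization")` falls back to Hugging Face's default summarization checkpoint; and the roughly 500-word chunking in the new code is needed because that default (a distilled BART model at the time of writing) accepts only about 1024 tokens per input. Below is a minimal sketch of exercising the new summarization step outside the Gradio UI; pinning the checkpoint explicitly is an assumption (the committed code leaves the choice to the pipeline), and the sample text stands in for one article chunk.

```python
# Sketch only, not part of the commit: run the summarization step directly.
# "sshleifer/distilbart-cnn-12-6" is an assumption (the pipeline's usual
# default); the sample string below stands in for one ~500-word chunk.
from transformers import pipeline

summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

chunk = (
    "Gradio wraps a Python function in a shareable web UI. The updated app "
    "fetches a Medium post, splits it into sentence-aligned chunks of at most "
    "500 words, and summarizes each chunk separately because the underlying "
    "model only accepts about 1024 tokens per input."
)

result = summarizer(chunk, max_length=40, min_length=10, do_sample=False)
print(result[0]["summary_text"])
```

Note that app.py instantiates the pipeline inside `summarize_blog_post`, so the model reloads on every request, and `iface.launch()` runs at import time; both are common in small Spaces demos, but hoisting the `pipeline(...)` call to module level would avoid the per-call reload.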