DrishtiSharma committed on
Commit
4faab88
·
verified ·
1 Parent(s): 0096849

Create patent_downloader.py

Browse files
Files changed (1) hide show
  1. patent_downloader.py +129 -0
patent_downloader.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Union, Optional
2
+ import os
3
+ import requests
4
+ import tempfile
5
+ from bs4 import BeautifulSoup
6
+
7
+
8
class PatentDownloader:
    """
    Download patent PDFs from Google Patents.

    For each patent number the downloader fetches the patent's page under
    ``base_url``, locates the PDF link in the page HTML, and saves the PDF
    as ``<patent_number>.pdf`` in the chosen output directory.
    """

    base_url = "https://patents.google.com/patent"

    # Class-level fallback so instances created without running __init__
    # (e.g. subclasses that override it) still have a request timeout.
    timeout = 30.0

    def __init__(self, verbose: bool = False, timeout: float = 30.0):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        timeout : float
            Seconds to wait on each HTTP request before giving up. Without
            a timeout a stalled connection would block the batch forever.
        """
        self.verbose = verbose
        self.timeout = timeout

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Failures are reported on stdout and skipped, so one bad patent
        number does not abort the remaining downloads.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers. Surrounding
            whitespace on each number is ignored.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to the system temporary
            directory. Created if it does not exist.

        Returns
        -------
        List[str]
            Paths of the PDFs that were downloaded successfully, in input
            order.
        """
        if isinstance(patents, str):
            patents = [patents]

        # Use the system temporary directory if no output path is provided
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []
        total = len(patents)

        for position, patent in enumerate(patents, start=1):
            try:
                # Numbers read from files or user input often carry stray
                # whitespace/newlines that would corrupt the URL and filename.
                patent = patent.strip()
                if self.verbose:
                    print(f"🔍 Downloading {position}/{total}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
            except Exception as e:
                # Deliberate best-effort boundary: report and continue.
                print(f"❌ Failed to download {patent}: {e}")
            else:
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")

        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        ValueError
            If ``patent_number`` is empty.
        RuntimeError
            If the patent page or the PDF cannot be fetched (non-200
            response), or no PDF link is found on the page.
        """
        # Local import keeps this block self-contained.
        from urllib.parse import urljoin

        # Fail before any network traffic on an obviously invalid number.
        if not patent_number:
            raise ValueError("Patent number must be a non-empty string.")

        # Construct the Google Patents URL
        patent_url = f"{self.base_url}/{patent_number}/en"

        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        # Fetch the HTML content of the patent page
        response = requests.get(patent_url, timeout=self.timeout)
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}"
            )

        # Parse the HTML content and extract the PDF link
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)

        if not pdf_url:
            raise RuntimeError(f"No PDF link found for patent {patent_number}.")

        # The fallback <a href> may be relative; resolve it against the page
        # URL. Absolute URLs pass through urljoin unchanged.
        pdf_url = urljoin(patent_url, pdf_url)

        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        # Download the PDF file
        pdf_response = requests.get(pdf_url, timeout=self.timeout)
        if pdf_response.status_code != 200:
            raise RuntimeError(
                f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}"
            )

        # Save the PDF to the specified output directory
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)

        return file_path

    @staticmethod
    def _extract_pdf_link(soup: "BeautifulSoup") -> Optional[str]:
        """
        Extract the PDF link from a parsed patent page.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The content of the ``citation_pdf_url`` meta tag if present and
            non-empty; otherwise the href of the first ``<a>`` tag whose
            href ends in ".pdf" (case-insensitive, possibly relative);
            otherwise None.
        """
        # Look for the 'citation_pdf_url' meta tag
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]

        # Fallback: first <a> tag whose href ends with '.pdf' (any case)
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if href.lower().endswith(".pdf"):
                return href

        return None