richardmfan commited on
Commit
44ce5f6
·
unverified ·
2 Parent(s): e0d30f3 ebc10b6

Merge pull request #18 from ido777/cosmetic_changes

Browse files
.env.template ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## ArxivDigest - environment settings
2
+
3
+
4
+ ##################################################################################################
5
+ # NOTE: This is an important note!
6
+ # Do not edit and commit your .env.template with your personal keys; it might reveal your API keys.
7
+ # Copy this file to .env, and only then edit your .env file, not this file.
8
+ ##################################################################################################
9
+ OPENAI_API_KEY=your_api_key
10
+
11
+ ## EMAIL SETTINGS
12
+ SENDGRID_API_KEY=your_api_key
13
+ FROM_EMAIL=your_email
14
+ TO_EMAIL=your_email
.github/workflows/daily_pipeline.yaml CHANGED
@@ -23,7 +23,7 @@ jobs:
23
  - name: Install dependencies
24
  run: |
25
  python -m pip install --upgrade pip
26
- pip install -r src/requirements.txt
27
  - name: Generate Digest
28
  run: |
29
  python src/action.py
 
23
  - name: Install dependencies
24
  run: |
25
  python -m pip install --upgrade pip
26
+ pip install -r requirements.txt
27
  - name: Generate Digest
28
  run: |
29
  python src/action.py
README.md CHANGED
@@ -83,7 +83,12 @@ To locally run the same UI as the Huggign Face space:
83
 
84
  1. Install the requirements in `src/requirements.txt` as well as `gradio`.
85
  2. Run `python src/app.py` and go to the local URL. From there you will be able to preview the papers from today, as well as the generated digests.
 
 
 
 
86
 
 
87
 
88
  ## ✅ Roadmap
89
 
 
83
 
84
  1. Install the requirements in `src/requirements.txt` as well as `gradio`.
85
  2. Run `python src/app.py` and go to the local URL. From there you will be able to preview the papers from today, as well as the generated digests.
86
+ 3. If you want to run the action locally you can copy .env.template to .env and then set the environment variables in the .env file.
87
+ - This file may be hidden by default in some operating systems due to the dot prefix. To reveal hidden files, follow the instructions for your operating system.
88
+ - The .env file is listed in .gitignore, which means that git does not track it and it will not be accidentally uploaded to the repository.
89
+ - For this reason you should not put your keys or your email address in the original .env.template: that file is tracked by git, and editing it might cause you to commit it with your API keys.
90
 
91
+ > **WARNING:** This is an important note! Do not edit and commit your .env.template with your personal keys or email; doing so might reveal your personal data, such as API keys and email address.
92
 
93
  ## ✅ Roadmap
94
 
src/requirements.txt → requirements.txt RENAMED
@@ -1,7 +1,8 @@
 
1
  beautifulsoup4==4.12.2
2
- tqdm==4.65.0
 
 
3
  pytz==2023.3
4
- numpy==1.24.2
5
- openai==0.27.4
6
  sendgrid==6.10.0
7
- pyyaml==6.00
 
1
+ PyYAML==6.0
2
  beautifulsoup4==4.12.2
3
+ numpy==1.25.0
4
+ openai==0.27.8
5
+ python-dotenv==1.0.0
6
  pytz==2023.3
 
 
7
  sendgrid==6.10.0
8
+ tqdm==4.65.0
src/action.py CHANGED
@@ -6,12 +6,12 @@ from datetime import date
6
  import argparse
7
  import yaml
8
  import os
9
-
 
10
  from relevancy import generate_relevance_score, process_subject_fields
11
  from download_new_papers import get_papers
12
 
13
 
14
-
15
  # Hackathon quality code. Don't judge too harshly.
16
  # Feel free to submit pull requests to improve the code.
17
 
@@ -23,7 +23,7 @@ topics = {
23
  "Quantitative Finance": "q-fin",
24
  "Statistics": "stat",
25
  "Electrical Engineering and Systems Science": "eess",
26
- "Economics": "econ"
27
  }
28
 
29
  physics_topics = {
@@ -39,32 +39,185 @@ physics_topics = {
39
  "Nuclear Experiment": "nucl-ex",
40
  "Nuclear Theory": "nucl-th",
41
  "Physics": "physics",
42
- "Quantum Physics": "quant-ph"
43
  }
44
 
45
 
46
  # TODO: surely theres a better way
47
  category_map = {
48
- "Astrophysics": ["Astrophysics of Galaxies", "Cosmology and Nongalactic Astrophysics", "Earth and Planetary Astrophysics", "High Energy Astrophysical Phenomena", "Instrumentation and Methods for Astrophysics", "Solar and Stellar Astrophysics"],
49
- "Condensed Matter": ["Disordered Systems and Neural Networks", "Materials Science", "Mesoscale and Nanoscale Physics", "Other Condensed Matter", "Quantum Gases", "Soft Condensed Matter", "Statistical Mechanics", "Strongly Correlated Electrons", "Superconductivity"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  "General Relativity and Quantum Cosmology": ["None"],
51
  "High Energy Physics - Experiment": ["None"],
52
  "High Energy Physics - Lattice": ["None"],
53
  "High Energy Physics - Phenomenology": ["None"],
54
  "High Energy Physics - Theory": ["None"],
55
  "Mathematical Physics": ["None"],
56
- "Nonlinear Sciences": ["Adaptation and Self-Organizing Systems", "Cellular Automata and Lattice Gases", "Chaotic Dynamics", "Exactly Solvable and Integrable Systems", "Pattern Formation and Solitons"],
 
 
 
 
 
 
57
  "Nuclear Experiment": ["None"],
58
  "Nuclear Theory": ["None"],
59
- "Physics": ["Accelerator Physics", "Applied Physics", "Atmospheric and Oceanic Physics", "Atomic and Molecular Clusters", "Atomic Physics", "Biological Physics", "Chemical Physics", "Classical Physics", "Computational Physics", "Data Analysis, Statistics and Probability", "Fluid Dynamics", "General Physics", "Geophysics", "History and Philosophy of Physics", "Instrumentation and Detectors", "Medical Physics", "Optics", "Physics and Society", "Physics Education", "Plasma Physics", "Popular Physics", "Space Physics"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  "Quantum Physics": ["None"],
61
- "Mathematics": ["Algebraic Geometry", "Algebraic Topology", "Analysis of PDEs", "Category Theory", "Classical Analysis and ODEs", "Combinatorics", "Commutative Algebra", "Complex Variables", "Differential Geometry", "Dynamical Systems", "Functional Analysis", "General Mathematics", "General Topology", "Geometric Topology", "Group Theory", "History and Overview", "Information Theory", "K-Theory and Homology", "Logic", "Mathematical Physics", "Metric Geometry", "Number Theory", "Numerical Analysis", "Operator Algebras", "Optimization and Control", "Probability", "Quantum Algebra", "Representation Theory", "Rings and Algebras", "Spectral Theory", "Statistics Theory", "Symplectic Geometry"],
62
- "Computer Science": ["Artificial Intelligence", "Computation and Language", "Computational Complexity", "Computational Engineering, Finance, and Science", "Computational Geometry", "Computer Science and Game Theory", "Computer Vision and Pattern Recognition", "Computers and Society", "Cryptography and Security", "Data Structures and Algorithms", "Databases", "Digital Libraries", "Discrete Mathematics", "Distributed, Parallel, and Cluster Computing", "Emerging Technologies", "Formal Languages and Automata Theory", "General Literature", "Graphics", "Hardware Architecture", "Human-Computer Interaction", "Information Retrieval", "Information Theory", "Logic in Computer Science", "Machine Learning", "Mathematical Software", "Multiagent Systems", "Multimedia", "Networking and Internet Architecture", "Neural and Evolutionary Computing", "Numerical Analysis", "Operating Systems", "Other Computer Science", "Performance", "Programming Languages", "Robotics", "Social and Information Networks", "Software Engineering", "Sound", "Symbolic Computation", "Systems and Control"],
63
- "Quantitative Biology": ["Biomolecules", "Cell Behavior", "Genomics", "Molecular Networks", "Neurons and Cognition", "Other Quantitative Biology", "Populations and Evolution", "Quantitative Methods", "Subcellular Processes", "Tissues and Organs"],
64
- "Quantitative Finance": ["Computational Finance", "Economics", "General Finance", "Mathematical Finance", "Portfolio Management", "Pricing of Securities", "Risk Management", "Statistical Finance", "Trading and Market Microstructure"],
65
- "Statistics": ["Applications", "Computation", "Machine Learning", "Methodology", "Other Statistics", "Statistics Theory"],
66
- "Electrical Engineering and Systems Science": ["Audio and Speech Processing", "Image and Video Processing", "Signal Processing", "Systems and Control"],
67
- "Economics": ["Econometrics", "General Economics", "Theoretical Economics"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  }
69
 
70
 
@@ -83,8 +236,10 @@ def generate_body(topic, categories, interest, threshold):
83
  raise RuntimeError(f"{category} is not a category of {topic}")
84
  papers = get_papers(abbr)
85
  papers = [
86
- t for t in papers
87
- if bool(set(process_subject_fields(t['subjects'])) & set(categories))]
 
 
88
  else:
89
  papers = get_papers(abbr)
90
  if interest:
@@ -92,27 +247,43 @@ def generate_body(topic, categories, interest, threshold):
92
  papers,
93
  query={"interest": interest},
94
  threshold_score=threshold,
95
- num_paper_in_prompt=16)
 
96
  body = "<br><br>".join(
97
- [f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}'
98
- for paper in relevancy])
 
 
 
99
  if hallucination:
100
- body = "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>" + body
 
 
 
101
  else:
102
  body = "<br><br>".join(
103
- [f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}'
104
- for paper in papers])
 
 
 
105
  return body
106
 
107
 
108
  if __name__ == "__main__":
 
 
109
  parser = argparse.ArgumentParser()
110
- parser.add_argument("--config", help="yaml config file to use", default="config.yaml")
 
 
111
  args = parser.parse_args()
112
  with open(args.config, "r") as f:
113
  config = yaml.safe_load(f)
 
114
  if "OPENAI_API_KEY" not in os.environ:
115
  raise RuntimeError("No openai api key found")
 
116
 
117
  topic = config["topic"]
118
  categories = config["categories"]
@@ -123,8 +294,8 @@ if __name__ == "__main__":
123
  body = generate_body(topic, categories, interest, threshold)
124
  with open("digest.html", "w") as f:
125
  f.write(body)
126
- if os.environ.get('SENDGRID_API_KEY', None):
127
- sg = SendGridAPIClient(api_key=os.environ.get('SENDGRID_API_KEY'))
128
  from_email = Email(from_email) # Change to your verified sender
129
  to_email = To(to_email)
130
  subject = date.today().strftime("Personalized arXiv Digest, %d %b %Y")
 
6
  import argparse
7
  import yaml
8
  import os
9
+ from dotenv import load_dotenv
10
+ import openai
11
  from relevancy import generate_relevance_score, process_subject_fields
12
  from download_new_papers import get_papers
13
 
14
 
 
15
  # Hackathon quality code. Don't judge too harshly.
16
  # Feel free to submit pull requests to improve the code.
17
 
 
23
  "Quantitative Finance": "q-fin",
24
  "Statistics": "stat",
25
  "Electrical Engineering and Systems Science": "eess",
26
+ "Economics": "econ",
27
  }
28
 
29
  physics_topics = {
 
39
  "Nuclear Experiment": "nucl-ex",
40
  "Nuclear Theory": "nucl-th",
41
  "Physics": "physics",
42
+ "Quantum Physics": "quant-ph",
43
  }
44
 
45
 
46
  # TODO: surely theres a better way
47
  category_map = {
48
+ "Astrophysics": [
49
+ "Astrophysics of Galaxies",
50
+ "Cosmology and Nongalactic Astrophysics",
51
+ "Earth and Planetary Astrophysics",
52
+ "High Energy Astrophysical Phenomena",
53
+ "Instrumentation and Methods for Astrophysics",
54
+ "Solar and Stellar Astrophysics",
55
+ ],
56
+ "Condensed Matter": [
57
+ "Disordered Systems and Neural Networks",
58
+ "Materials Science",
59
+ "Mesoscale and Nanoscale Physics",
60
+ "Other Condensed Matter",
61
+ "Quantum Gases",
62
+ "Soft Condensed Matter",
63
+ "Statistical Mechanics",
64
+ "Strongly Correlated Electrons",
65
+ "Superconductivity",
66
+ ],
67
  "General Relativity and Quantum Cosmology": ["None"],
68
  "High Energy Physics - Experiment": ["None"],
69
  "High Energy Physics - Lattice": ["None"],
70
  "High Energy Physics - Phenomenology": ["None"],
71
  "High Energy Physics - Theory": ["None"],
72
  "Mathematical Physics": ["None"],
73
+ "Nonlinear Sciences": [
74
+ "Adaptation and Self-Organizing Systems",
75
+ "Cellular Automata and Lattice Gases",
76
+ "Chaotic Dynamics",
77
+ "Exactly Solvable and Integrable Systems",
78
+ "Pattern Formation and Solitons",
79
+ ],
80
  "Nuclear Experiment": ["None"],
81
  "Nuclear Theory": ["None"],
82
+ "Physics": [
83
+ "Accelerator Physics",
84
+ "Applied Physics",
85
+ "Atmospheric and Oceanic Physics",
86
+ "Atomic and Molecular Clusters",
87
+ "Atomic Physics",
88
+ "Biological Physics",
89
+ "Chemical Physics",
90
+ "Classical Physics",
91
+ "Computational Physics",
92
+ "Data Analysis, Statistics and Probability",
93
+ "Fluid Dynamics",
94
+ "General Physics",
95
+ "Geophysics",
96
+ "History and Philosophy of Physics",
97
+ "Instrumentation and Detectors",
98
+ "Medical Physics",
99
+ "Optics",
100
+ "Physics and Society",
101
+ "Physics Education",
102
+ "Plasma Physics",
103
+ "Popular Physics",
104
+ "Space Physics",
105
+ ],
106
  "Quantum Physics": ["None"],
107
+ "Mathematics": [
108
+ "Algebraic Geometry",
109
+ "Algebraic Topology",
110
+ "Analysis of PDEs",
111
+ "Category Theory",
112
+ "Classical Analysis and ODEs",
113
+ "Combinatorics",
114
+ "Commutative Algebra",
115
+ "Complex Variables",
116
+ "Differential Geometry",
117
+ "Dynamical Systems",
118
+ "Functional Analysis",
119
+ "General Mathematics",
120
+ "General Topology",
121
+ "Geometric Topology",
122
+ "Group Theory",
123
+ "History and Overview",
124
+ "Information Theory",
125
+ "K-Theory and Homology",
126
+ "Logic",
127
+ "Mathematical Physics",
128
+ "Metric Geometry",
129
+ "Number Theory",
130
+ "Numerical Analysis",
131
+ "Operator Algebras",
132
+ "Optimization and Control",
133
+ "Probability",
134
+ "Quantum Algebra",
135
+ "Representation Theory",
136
+ "Rings and Algebras",
137
+ "Spectral Theory",
138
+ "Statistics Theory",
139
+ "Symplectic Geometry",
140
+ ],
141
+ "Computer Science": [
142
+ "Artificial Intelligence",
143
+ "Computation and Language",
144
+ "Computational Complexity",
145
+ "Computational Engineering, Finance, and Science",
146
+ "Computational Geometry",
147
+ "Computer Science and Game Theory",
148
+ "Computer Vision and Pattern Recognition",
149
+ "Computers and Society",
150
+ "Cryptography and Security",
151
+ "Data Structures and Algorithms",
152
+ "Databases",
153
+ "Digital Libraries",
154
+ "Discrete Mathematics",
155
+ "Distributed, Parallel, and Cluster Computing",
156
+ "Emerging Technologies",
157
+ "Formal Languages and Automata Theory",
158
+ "General Literature",
159
+ "Graphics",
160
+ "Hardware Architecture",
161
+ "Human-Computer Interaction",
162
+ "Information Retrieval",
163
+ "Information Theory",
164
+ "Logic in Computer Science",
165
+ "Machine Learning",
166
+ "Mathematical Software",
167
+ "Multiagent Systems",
168
+ "Multimedia",
169
+ "Networking and Internet Architecture",
170
+ "Neural and Evolutionary Computing",
171
+ "Numerical Analysis",
172
+ "Operating Systems",
173
+ "Other Computer Science",
174
+ "Performance",
175
+ "Programming Languages",
176
+ "Robotics",
177
+ "Social and Information Networks",
178
+ "Software Engineering",
179
+ "Sound",
180
+ "Symbolic Computation",
181
+ "Systems and Control",
182
+ ],
183
+ "Quantitative Biology": [
184
+ "Biomolecules",
185
+ "Cell Behavior",
186
+ "Genomics",
187
+ "Molecular Networks",
188
+ "Neurons and Cognition",
189
+ "Other Quantitative Biology",
190
+ "Populations and Evolution",
191
+ "Quantitative Methods",
192
+ "Subcellular Processes",
193
+ "Tissues and Organs",
194
+ ],
195
+ "Quantitative Finance": [
196
+ "Computational Finance",
197
+ "Economics",
198
+ "General Finance",
199
+ "Mathematical Finance",
200
+ "Portfolio Management",
201
+ "Pricing of Securities",
202
+ "Risk Management",
203
+ "Statistical Finance",
204
+ "Trading and Market Microstructure",
205
+ ],
206
+ "Statistics": [
207
+ "Applications",
208
+ "Computation",
209
+ "Machine Learning",
210
+ "Methodology",
211
+ "Other Statistics",
212
+ "Statistics Theory",
213
+ ],
214
+ "Electrical Engineering and Systems Science": [
215
+ "Audio and Speech Processing",
216
+ "Image and Video Processing",
217
+ "Signal Processing",
218
+ "Systems and Control",
219
+ ],
220
+ "Economics": ["Econometrics", "General Economics", "Theoretical Economics"],
221
  }
222
 
223
 
 
236
  raise RuntimeError(f"{category} is not a category of {topic}")
237
  papers = get_papers(abbr)
238
  papers = [
239
+ t
240
+ for t in papers
241
+ if bool(set(process_subject_fields(t["subjects"])) & set(categories))
242
+ ]
243
  else:
244
  papers = get_papers(abbr)
245
  if interest:
 
247
  papers,
248
  query={"interest": interest},
249
  threshold_score=threshold,
250
+ num_paper_in_prompt=16,
251
+ )
252
  body = "<br><br>".join(
253
+ [
254
+ f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}<br>Score: {paper["Relevancy score"]}<br>Reason: {paper["Reasons for match"]}'
255
+ for paper in relevancy
256
+ ]
257
+ )
258
  if hallucination:
259
+ body = (
260
+ "Warning: the model hallucinated some papers. We have tried to remove them, but the scores may not be accurate.<br><br>"
261
+ + body
262
+ )
263
  else:
264
  body = "<br><br>".join(
265
+ [
266
+ f'Title: <a href="{paper["main_page"]}">{paper["title"]}</a><br>Authors: {paper["authors"]}'
267
+ for paper in papers
268
+ ]
269
+ )
270
  return body
271
 
272
 
273
  if __name__ == "__main__":
274
+ # Load the .env file.
275
+ load_dotenv()
276
  parser = argparse.ArgumentParser()
277
+ parser.add_argument(
278
+ "--config", help="yaml config file to use", default="config.yaml"
279
+ )
280
  args = parser.parse_args()
281
  with open(args.config, "r") as f:
282
  config = yaml.safe_load(f)
283
+
284
  if "OPENAI_API_KEY" not in os.environ:
285
  raise RuntimeError("No openai api key found")
286
+ openai.api_key = os.environ.get("OPENAI_API_KEY")
287
 
288
  topic = config["topic"]
289
  categories = config["categories"]
 
294
  body = generate_body(topic, categories, interest, threshold)
295
  with open("digest.html", "w") as f:
296
  f.write(body)
297
+ if os.environ.get("SENDGRID_API_KEY", None):
298
+ sg = SendGridAPIClient(api_key=os.environ.get("SENDGRID_API_KEY"))
299
  from_email = Email(from_email) # Change to your verified sender
300
  to_email = To(to_email)
301
  subject = date.today().strftime("Personalized arXiv Digest, %d %b %Y")