-
Notifications
You must be signed in to change notification settings - Fork 2.8k
/
Copy pathfetch_papers.py
40 lines (38 loc) · 1.66 KB
/
fetch_papers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
'''
Author: doodhwala, leezeeyee
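Python 3 script that downloads every paper linked as [`[pdf]`] in README.md
into papers/<section heading>/, skipping files that already exist.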
'''
import os, re, requests, codecs
# Download every paper linked as [`[pdf]`] in README.md into
# papers/<section heading>/<paper title>.pdf. Files that already exist are
# skipped, so an interrupted run can simply be started again.

filename = 'README.md'
directory = 'papers'
# Seconds to wait on an unresponsive server before giving up on one paper;
# without a timeout a single stalled connection would hang the whole run.
request_timeout = 60

# Characters removed from headings and titles before they are used as
# directory / file names (the set Windows rejects in names, plus '^').
# Compiled once, up front, so it is defined even when a pdf link appears
# before the first section heading.
win_restricted_chars = re.compile(r'[\^\/\\\:\*\?\"<>\|]')
# Group 1: all text preceding the [`[pdf]`] marker (used as the title).
# Group 2: the link target inside the parentheses that follow the marker.
# This matches every line carrying a pdf link; tighten the expression if
# only a subset of the papers should be fetched.
pdf_link = re.compile(r'(.*?)\[`\[pdf\]`\]\((.*?)\)')

os.makedirs(directory, exist_ok=True)
papers = []  # unused by this script; kept so the module-level name still exists

with open(filename, encoding='utf-8') as readme:
    lines = readme.read().split('\n')

# Until the first section heading is seen, papers go directly into `directory`.
heading, section_path = '', directory
for line in lines:
    if '## 20' in line:
        # Section heading: following papers are stored in a directory named
        # after the heading text (everything after the first '##').
        # NOTE: the text is not stripped, so the name keeps any leading
        # space; left as-is so previously downloaded files are still found.
        heading = line.strip().split('##')[1]
        heading = win_restricted_chars.sub("", heading)
        section_path = os.path.join(directory, heading)
        os.makedirs(section_path, exist_ok=True)

    if '[`[pdf]`]' not in line:
        continue
    result = pdf_link.search(line)
    if not result:
        continue

    paper, url = result.groups()
    paper = win_restricted_chars.sub("", paper).strip('- ')
    target = os.path.join(section_path, paper + '.pdf')

    # Auto-resume: anything already on disk is treated as downloaded.
    if os.path.exists(target):
        continue

    print('Fetching', paper)
    try:
        response = requests.get(url, timeout=request_timeout)
        # Turn 4xx/5xx answers into errors instead of saving the error page
        # as a ".pdf" that the auto-resume check would then never retry.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error: {}".format(e))
        continue

    # Write to a temporary name and rename afterwards, so an interrupted
    # write never leaves a truncated file under the final name.
    partial = target + '.part'
    with open(partial, 'wb') as out:
        out.write(response.content)
    os.replace(partial, target)