Python sample code for automatically downloading papers from arXiv
- 2021-08-17 00:17:28
- OfStack
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/02/11 21:44
# @Author : dangxusheng
# @Email : dangxusheng163@163.com
# @File : download_by_href.py
'''
Automatically download literature from arxiv.org
'''
import os
import os.path as osp
import requests
from lxml import etree
from pprint import pprint
import re
import time
import glob
# Request headers sent with the arxiv.org search query (see search()).
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
    "Host": 'arxiv.org'
}
# PDF base URL of the mirror that is tried first.
HREF_CN = 'http://cn.arxiv.org/pdf/'
# PDF base URL of the original site, used as the fallback when the mirror
# fails. It must differ from HREF_CN, otherwise the retry just repeats the
# request that has already failed.
HREF_SRC = 'https://arxiv.org/pdf/'
# Directory that receives the downloaded PDFs; created on import if missing.
SAVE_PATH = '/media/dangxs/E/Paper/download_at_20200730'
os.makedirs(SAVE_PATH, exist_ok=True)
# Search-result entries ({'title', 'href'}) whose download failed.
FAIL_URLS = []
# Text file to which the hrefs of failed downloads are appended.
FAIL_URLS_TXT = f'{SAVE_PATH}/fail_urls.txt'
def download(url, title):
    """Download the PDF at ``url`` into SAVE_PATH, using ``title`` as file name.

    Characters that are not allowed in file names are replaced by a space.
    The data is streamed into a temporary ``.part`` file and only moved to
    its final name once the download finished and looks plausible, so a
    failed attempt never leaves a truncated ``.pdf`` behind.

    :param url: address of the PDF file.
    :param title: paper title, used (sanitized) as the file name.
    :return: True if the file already exists (larger than 50 KiB) or was
        downloaded with a size of at least 10 KiB, otherwise False.
    """
    pattern = r'[\\/:*?"\'<>|\r\n]+'
    new_title = re.sub(pattern, " ", title)
    print(f'new title: {new_title}')
    save_filepath = '%s/%s.pdf' % (SAVE_PATH, new_title)
    if osp.exists(save_filepath) and osp.getsize(save_filepath) > 50 * 1024:
        print(f'this pdf is be existed.')
        return True

    tmp_filepath = save_filepath + '.part'
    try:
        # (connect, read) timeouts keep a stalled server from blocking the
        # whole batch; the context manager releases the connection.
        with requests.get(url, stream=True, timeout=(10, 60)) as r:
            # An HTTP error page must not be stored as if it were the PDF.
            r.raise_for_status()
            with open(tmp_filepath, 'wb') as file:
                # Byte downloading
                for chunk in r.iter_content(2048):
                    file.write(chunk)
        # Anything smaller than 10 KiB is treated as a failed download.
        if osp.getsize(tmp_filepath) >= 10 * 1024:
            os.replace(tmp_filepath, save_filepath)
            print('%s Download succeeded .' % title)
            return True
    except Exception as e:
        # Best-effort batch job: report the problem and let the caller
        # decide whether to retry from another address.
        print(e)

    # Failure path: remove whatever partial data was written.
    if osp.exists(tmp_filepath):
        os.remove(tmp_filepath)
    return False
# Search arxiv.org for the papers to download
def search(start_size=0, title_keywords='Facial Expression'):
    """Query the arxiv.org advanced search and parse one page of results.

    The query looks for ``title_keywords`` in paper titles, restricted to
    computer science, announced between 2015 and 2020, 50 results per page.

    :param start_size: offset of the first result to return.
    :param title_keywords: keywords that must occur in the title.
    :return: ``(info_list, total)`` where ``info_list`` is a list of
        ``{'title': ..., 'href': ...}`` dicts for this page and ``total`` is
        the number of results reported by the site; ``([], 0)`` when no
        result count can be found on the page.
    """
    # Access address : https://arxiv.org/find/grp_eess,grp_stat,grp_cs,grp_econ,grp_math/1/ti:+Face/0/1/0/past,2018,2019/0/1?skip=200&query_id=1c582e6c8afc6146&client_host=cn.arxiv.org
    req_url = 'https://arxiv.org/search/advanced'
    req_data = {
        'advanced': 1,
        'terms-0-operator': 'AND',
        'terms-0-term': title_keywords,
        'terms-0-field': 'title',
        'classification-computer_science': 'y',
        'classification-physics_archives': 'all',
        'classification-include_cross_list': 'include',
        'date-filter_by': 'date_range',  # date_range | specific_year
        # 'date-year': DOWN_YEAR,
        'date-year': '',
        'date-from_date': '2015',
        'date-to_date': '2020',
        'date-date_type': 'announced_date_first',  # submitted_date | submitted_date_first | announced_date_first
        'abstracts': 'show',
        'size': 50,
        'order': '-announced_date_first',
        'start': start_size,
    }
    # A timeout keeps an unresponsive server from blocking the script forever.
    res = requests.get(req_url, params=req_data, headers=headers, timeout=30)
    html = etree.HTML(res.content.decode())

    total_text = html.xpath('//h1[@class="title is-clearfix"]/text()')
    total_text = ''.join(total_text).replace('\n', '').strip(' ')
    # i.e. : "Showing 1-50 of 355 results". Thousands separators are removed
    # first so that "1,355" is read as 1355 instead of 355.
    total_text = total_text.replace(',', '')
    match = re.search(r'of\s+(\d+)\s+result', total_text)
    if match:
        total = int(match.group(1))  # Total number of queries
    else:
        # Fall back to the last number in the heading.
        num = re.findall(r'\d+', total_text)
        # Sorry, your query returned no results
        if not num:
            return [], 0
        total = int(num[-1])

    paper_list = html.xpath('//ol[@class="breathe-horizontal"]/li')
    info_list = []
    for p in paper_list:
        title = p.xpath('./p[@class="title is-5 mathjax"]//text()')
        title = ''.join(title).replace('\n', '').strip(' ')
        hrefs = p.xpath('./div/p/a/@href')
        if not hrefs:
            # An entry without a link cannot be downloaded; skip it rather
            # than abort the whole page with an IndexError.
            continue
        info_list.append({'title': title, 'href': hrefs[0]})
    return info_list, total
# Collect the papers to download from a specified page
def search_special():
    """Collect paper links from the "the-gan-zoo" page hosted on gitee.com.

    Every ``<li>`` inside the rendered markdown body that has an ``<a>`` as
    direct child yields one entry; the entry title is the whole text of the
    list item.

    :return: list of ``{'title': ..., 'href': ...}`` dicts (also
        pretty-printed to stdout).
    """
    # A timeout keeps an unresponsive server from blocking the script forever.
    res = requests.get('https://gitee.com/weberyoung/the-gan-zoo?_from=gitee_search', timeout=30)
    html = etree.HTML(res.content.decode())
    paper_list = html.xpath('//div[@class="file_content markdown-body"]//li')
    info_list = []
    for p in paper_list:
        hrefs = p.xpath('./a/@href')
        if not hrefs:
            # List items without a direct link carry nothing to download;
            # skip them instead of failing with an IndexError.
            continue
        title = ''.join(p.xpath('.//text()')).replace('\n', '').strip(' ')
        info_list.append({'title': title, 'href': hrefs[0]})
    pprint(info_list)
    return info_list
if __name__ == '__main__':
    # Download every paper whose title matches `keywords`, page by page,
    # then append the hrefs of all failed downloads to FAIL_URLS_TXT.
    page_size = 50  # must match the 'size' value used by search()
    keywords = 'Facial Action Unit'
    page_idx = 0
    total = None  # unknown until the first result page has been parsed
    while total is None or page_idx * page_size < total:
        paper_list, total = search(page_idx * page_size, keywords)
        print(f'total: {total}')
        if total == 0:
            print('no found .')
            if page_idx == 0:
                # Nothing matched at all: nothing to download or report.
                raise SystemExit(0)
            # A later page came back empty: stop paging, but still report
            # the failures collected so far.
            break
        for p in paper_list:
            title = p['title']
            paper_id = p['href'].split('/')[-1]
            href = HREF_CN + paper_id + '.pdf'
            print(href)
            if not download(href, title):
                print(' Download from domestic image failed, download from source address >>>>')
                # Use international URL Download again 1 Times
                href = HREF_SRC + paper_id + '.pdf'
                if not download(href, title):
                    FAIL_URLS.append(p)
            # Short pause between papers to avoid hammering the server.
            time.sleep(1)
        page_idx += 1
    # The loop above already covers the final, partially filled page, so no
    # separate "last part" pass is needed.
    pprint(FAIL_URLS)
    with open(FAIL_URLS_TXT, 'a+') as f:
        f.writelines(item['href'] + '\n' for item in FAIL_URLS)
    print('done.')
That concludes the sample code for automatically downloading papers from arXiv with Python. For more information about downloading papers from arXiv with Python, please see the other related articles on this site.