import requests
import crossref_commons.retrieval
from time import sleep
from bs4 import BeautifulSoup

from _arango import ArangoDB

arango = ArangoDB()
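
# Pipeline overview (assumption: `_arango.ArangoDB` is a project-local wrapper
# exposing `.db`, a python-arango database handle, and `.fix_key()`, which
# sanitises a DOI into a valid ArangoDB document key):
#   1. Read DOIs from review_references.csv, one per line.
#   2. Look each DOI up in the DOAJ search API.
#   3. Auto-download MDPI PDFs; for other hosts, print the URL and let the
#      user confirm/record it interactively.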

def download_file(doi, url):
    """Download a single file for `doi` from `url`, choosing the file type
    from the Content-Type header. (Currently not called by the main block.)"""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        content_type = response.headers.get('Content-Type', '')
        if content_type.startswith('application/pdf'):
            file_extension = 'pdf'
        elif content_type.startswith('text/'):
            file_extension = 'md'
        else:
            print(f"Unsupported content type: {content_type} for DOI: {doi}")
            return
        file_name = f"{doi}.{file_extension}".replace('/', '_')
        if file_extension == 'md':
            # Debugging stop: dump the extracted text and exit before
            # anything is written to disk.
            soup = BeautifulSoup(response.content, 'html.parser')
            print(soup.text)
            exit()
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")

def info(doi):
    """Look `doi` up in DOAJ; download MDPI PDFs automatically, otherwise
    show the link and record it if the user confirms with Enter."""
    arango.db.collection('dois_checked').insert(
        {'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
    url = f'https://doaj.org/api/search/articles/{doi}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        for result in data.get('results', []):
            for link in result.get('bibjson', {}).get('link', []):
                if 'mdpi.com' in link['url']:
                    # MDPI article pages expose the PDF behind an
                    # <a class="UD_ArticlePDF"> element.
                    r = requests.get(link['url'])
                    soup = BeautifulSoup(r.content, 'html.parser')
                    pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                    if pdf_link_html is None:
                        print(f'No PDF link found on page for {doi}')
                        continue
                    pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                    pdf = requests.get(pdf_url)
                    with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                        f.write(pdf.content)
                    sleep(1)  # be polite to the server
                    print(f'Downloaded PDF for {doi}')
                else:
                    # Manual review: Enter records the link as downloaded,
                    # any other input skips it.
                    print(link['url'])
                    user_input = input()
                    if user_input == '':
                        arango.db.collection('sci_articles_links_downloaded').insert({
                            '_key': arango.fix_key(doi),
                            'doi': doi,
                            'url': link['url'],
                        })
                return doi
    else:
        print(f"Error fetching metadata for DOI: {doi}. "
              f"HTTP Status Code: {response.status_code}")

if __name__ == '__main__':
    # Read DOIs from file, one per line
    with open('review_references.csv', 'r') as f:
        references = f.readlines()

    # DOIs already processed on a previous run
    try:
        with open('review_references.txt') as f2:
            ref_done = {line.strip() for line in f2}
    except FileNotFoundError:
        ref_done = set()

    # First pass: check that Crossref knows each DOI
    for ref in references:
        doi = ref.strip()
        print('###', doi.upper())
        try:
            # Result unused; this pass only validates the DOI
            cr = crossref_commons.retrieval.get_publication_as_json(doi)
        except ValueError:
            print(f"Error fetching metadata for DOI: {doi}")
            continue

    # Second pass: fetch each DOI via DOAJ, skipping ones already done
    for ref in references:
        doi = ref.strip()
        if doi in ref_done:
            continue
        r = info(doi)
        if r:
            with open('review_references.txt', 'a') as f2:
                f2.write(r + '\n')
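
# Expected input: review_references.csv holds one DOI per line, e.g.
#   10.3390/s21041234   (hypothetical example)
# Progress is appended to review_references.txt so reruns skip finished DOIs.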