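"""Bulk-download full texts for a list of DOIs (one per line in
review_references.csv). Metadata is resolved via Crossref and the DOAJ API;
ScienceDirect items are fetched through the dl_elsy helper module, MDPI items
by scraping the article page. Processed DOIs are appended to
review_references.txt so interrupted runs can be resumed.
"""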
import requests
import crossref_commons.retrieval
from time import sleep
from bs4 import BeautifulSoup

import dl_elsy  # helper for ScienceDirect/Elsevier downloads (provides get_doc)


def download_file(doi, url):
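    """Download the document at *url* and store it under a DOI-derived name.

    PDFs are written to disk; text/HTML responses are parsed with
    BeautifulSoup and printed for inspection instead. Not called by the
    main script below.
    """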
    try:
        response = requests.get(url)
        response.raise_for_status()  # raise on non-2xx responses

        content_type = response.headers['Content-Type']
        if content_type == 'application/pdf':
            file_extension = 'pdf'
        elif content_type.startswith('text/'):
            file_extension = 'md'
        else:
            print(f"Unsupported content type: {content_type} for DOI: {doi}")
            return

        file_name = f"{doi}.{file_extension}".replace('/', '_')

        if file_extension == 'md':
            # Text/HTML response: print the extracted text for manual
            # inspection and stop; nothing is written to disk in this case.
            soup = BeautifulSoup(response.content, 'html.parser')
            print(soup.text)
            exit()

        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")


def get_article_info(doi):
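    """Resolve *doi* through the DOAJ article search API and fetch full text.

    MDPI links are scraped for PDF and EPUB downloads; ScienceDirect links
    are delegated to dl_elsy.get_doc; any other host is printed and the
    script pauses for manual handling.
    """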
    url = f'https://doaj.org/api/search/articles/{doi}'
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        for result in data.get('results', []):
            for link in result.get('bibjson', {}).get('link', []):
                if 'mdpi.com' in link['url']:
                    # Scrape the MDPI article page for its PDF anchor.
                    r = requests.get(link['url'])
                    soup = BeautifulSoup(r.content, 'html.parser')
                    pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                    if pdf_link_html is None:
                        # soup.find returns None when no PDF anchor exists.
                        print(f"No PDF link found for DOI: {doi}")
                        continue
                    pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                    pdf = requests.get(pdf_url)
                    with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                        f.write(pdf.content)
                    sleep(1)  # be polite between requests
                    # MDPI serves an EPUB version at <article URL>/epub.
                    epub = requests.get(link['url'] + '/epub')
                    with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
                        f.write(epub.content)
                    sleep(1)
                    print(f'Downloaded PDF and EPUB for {doi}')
                elif 'sciencedirect.com' in link['url']:
                    sleep(1)
                    return dl_elsy.get_doc(doi)
                else:
                    # Unknown host: show the URL and wait for manual handling.
                    print(link['url'])
                    input()
                    return doi
    else:
        print(f"Error fetching metadata for DOI: {doi}. "
              f"HTTP Status Code: {response.status_code}")


# Read DOIs from file.
with open('review_references.csv', 'r') as f:
    references = f.readlines()

# Load DOIs already processed on earlier runs; 'a+' creates the progress
# file if it is missing without truncating it.
with open('review_references.txt', 'a+') as f2:
    f2.seek(0)  # 'a+' opens positioned at the end; rewind before reading
    ref_done = [line.strip() for line in f2]

# Process each DOI: this pass only handles ScienceDirect-hosted references.
for ref in references:
    doi = ref.strip()
    print('###', doi.upper())
    try:
        cr = crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError:
        print(f"Error fetching metadata for DOI: {doi}")
        continue
    if 'sciencedirect.com' not in str(cr):
        continue

    if doi not in ref_done:
        sleep(1)  # rate-limit between downloads
        r = dl_elsy.get_doc(doi)
        if r:
            # Record the DOI so later runs skip it.
            with open('review_references.txt', 'a') as f2:
                f2.write(f'{r}\n')

# Stop here; the DOAJ/MDPI pass below is currently disabled.
exit()
for ref in references:
    doi = ref.strip()
    with open('review_references.txt', 'a') as f2:
        r = get_article_info(doi)
        if r:
            f2.write(f'{r}\n')  # newline keeps one DOI per line