import requests
import crossref_commons.retrieval
from time import sleep
from bs4 import BeautifulSoup
import dl_elsy


def download_file(doi, url):
    """Download a single file for a DOI, picking the extension from the Content-Type."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise on HTTP errors (4xx/5xx)
        content_type = response.headers['Content-Type']
        if content_type == 'application/pdf':
            file_extension = 'pdf'
        elif content_type.startswith('text/'):
            file_extension = 'md'
        else:
            print(f"Unsupported content type: {content_type} for DOI: {doi}")
            return
        # DOIs contain slashes, which are not valid in file names
        file_name = f"{doi}.{file_extension}".replace('/', '_')
        if file_extension == 'md':
            # HTML landing page: dump the text and stop so it can be inspected manually
            soup = BeautifulSoup(response.content, 'html.parser')
            print(soup.text)
            exit()
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")


def get_article_info(doi):
    """Look a DOI up in the DOAJ API and fetch its full text from known publishers."""
    url = f'https://doaj.org/api/search/articles/{doi}'
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")
        return None
    data = response.json()
    for result in data.get('results', []):
        for link in result.get('bibjson', {}).get('link', []):
            if 'mdpi.com' in link['url']:
                # MDPI: scrape the article page for the PDF link, then fetch the EPUB too
                r = requests.get(link['url'])
                soup = BeautifulSoup(r.content, 'html.parser')
                pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                if pdf_link_html is None:
                    print(f"No PDF link found on MDPI page for DOI: {doi}")
                    continue
                pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                pdf = requests.get(pdf_url)
                with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                    f.write(pdf.content)
                sleep(1)  # rate-limit between requests
                epub = requests.get(link['url'] + '/epub')
                with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
                    f.write(epub.content)
                sleep(1)
                print(f'Downloaded PDF and EPUB for {doi}')
            elif 'sciencedirect.com' in link['url']:
                # ScienceDirect: delegate to the Elsevier downloader
                r = dl_elsy.get_doc(doi)
                sleep(1)
                return r
            else:
                # Unknown publisher: show the URL and wait for manual confirmation
                print(link['url'])
                input()
    return doi


# Read DOIs from file
with open('review_references.csv', 'r') as f:
    references = f.readlines()

# Read already-processed DOIs, creating the progress file if it does not exist
# (opening it in 'w' mode here would wipe the progress log)
with open('review_references.txt', 'a+') as f2:
    f2.seek(0)
    ref_done = {line.strip() for line in f2}

# First pass: ScienceDirect references via dl_elsy
for ref in references:
    doi = ref.strip()
    print('###', doi.upper())
    try:
        cr = crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError:
        print(f"Error fetching metadata for DOI: {doi}")
        continue
    if 'sciencedirect.com' not in str(cr):
        continue
    if doi not in ref_done:
        sleep(1)
        r = dl_elsy.get_doc(doi)
        if r:
            # Record the processed DOI so it is skipped on the next run
            with open('review_references.txt', 'a') as f2:
                f2.write(f'{r}\n')
            exit()  # stop after one successful download; rerun to fetch the next one

# Second pass: everything else via the DOAJ lookup
for ref in references:
    doi = ref.strip()
    with open('review_references.txt', 'a') as f2:
        r = get_article_info(doi)
        if r:
            f2.write(f'{r}\n')