"""Fetch full texts for a list of DOIs.

Reads DOIs from review_references.csv, looks each one up in DOAJ,
downloads MDPI PDFs automatically, and records other links for manual
download. Processed DOIs are appended to review_references.txt so the
script can be resumed without repeating work.
"""
import requests
import crossref_commons.retrieval
from time import sleep
from urllib.parse import quote
from bs4 import BeautifulSoup
from _arango import ArangoDB

arango = ArangoDB()


def download_file(doi, url):
    """Download a single article: save PDFs to disk, print HTML as plain text."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        content_type = response.headers['Content-Type']
        if content_type == 'application/pdf':
            file_extension = 'pdf'
        elif content_type.startswith('text/'):
            file_extension = 'md'
        else:
            print(f"Unsupported content type: {content_type} for DOI: {doi}")
            return
        # DOIs contain slashes, which are not valid in file names
        file_name = f"{doi}.{file_extension}".replace('/', '_')
        if file_extension == 'md':
            # HTML responses are only printed for inspection, not saved
            soup = BeautifulSoup(response.content, 'html.parser')
            print(soup.text)
            return
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")


def info(doi):
    """Look up a DOI in DOAJ and download (or record) its full-text links.

    Returns the DOI once it has been handled, so the caller can mark it done.
    """
    # Record that this DOI has been looked at, whatever the outcome
    arango.db.collection('dois_checked').insert(
        {'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
    url = f'https://doaj.org/api/search/articles/{quote(doi, safe="")}'
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching metadata for DOI: {doi}. "
              f"HTTP Status Code: {response.status_code}")
        return None
    data = response.json()
    for result in data.get('results', []):
        for link in result.get('bibjson', {}).get('link', []):
            if 'mdpi.com' in link['url']:
                # MDPI article pages expose a direct PDF link we can scrape
                r = requests.get(link['url'])
                soup = BeautifulSoup(r.content, 'html.parser')
                pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                if pdf_link_html is None:
                    print(f"No PDF link found on {link['url']}")
                    continue
                pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                pdf = requests.get(pdf_url)
                with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                    f.write(pdf.content)
                sleep(1)  # be polite to the server between requests
                print(f'Downloaded PDF for {doi}')
                return doi
            else:
                # Other publishers: show the link and let the user fetch it
                # manually. Pressing Enter records the link as downloaded;
                # any other input skips it.
                print(link['url'])
                user_input = input()
                if user_input == '':
                    arango.db.collection('sci_articles_links_downloaded').insert({
                        '_key': arango.fix_key(doi),
                        'doi': doi,
                        'url': link['url'],
                    })
                    return doi
    return None


if __name__ == '__main__':
    # Read the DOIs to process
    with open('review_references.csv') as f:
        references = [line.strip() for line in f if line.strip()]

    # Read the DOIs that have already been processed; the file may not exist
    # yet, and it must not be opened for writing here or the list is wiped
    try:
        with open('review_references.txt') as f2:
            ref_done = {line.strip() for line in f2}
    except FileNotFoundError:
        ref_done = set()

    # Process each DOI, skipping ones already recorded as done
    with open('review_references.txt', 'a') as f2:
        for doi in references:
            if doi in ref_done:
                continue
            print('###', doi.upper())
            # Validate the DOI against Crossref before trying to download
            try:
                crossref_commons.retrieval.get_publication_as_json(doi)
            except ValueError:
                print(f"Error fetching metadata for DOI: {doi}")
                continue
            r = info(doi)
            if r:
                # One DOI per line in the done file
                f2.write(r + '\n')