From 58ef6941285827e4b629ca0dc2f05a79516e141f Mon Sep 17 00:00:00 2001
From: lasseedfast
Date: Fri, 11 Oct 2024 16:52:59 +0200
Subject: [PATCH] Refactor chatbot.py, get_article_info.py and ingest_pdfs.py;
 add scrape_html.py

- Rename the LLM instance in chatbot.py to the more descriptive `chatbot`.
- Rename get_article_info() to info(), remove unused imports, and record
  checked DOIs and manually downloaded links in ArangoDB.
- Move module-level setup in ingest_pdfs.py to the top of the file, broaden
  the DOI regex, and key stored files by DOI.
- Add scrape_html.py to collect DOIs from saved ScienceDirect HTML.
- Delete dl_article_libgen.py.
---
 chatbot.py           |  9 +++--
 dl_article_libgen.py | 13 -------
 get_article_info.py  | 82 ++++++++++++++++++++------------------
 ingest_pdfs.py       | 50 +++++++++++++++------------
 scrape_html.py       | 62 +++++++++++++++++++++++++++++++++
 5 files changed, 133 insertions(+), 83 deletions(-)
 delete mode 100644 dl_article_libgen.py
 create mode 100644 scrape_html.py

diff --git a/chatbot.py b/chatbot.py
index d6a054b..1545a3b 100644
--- a/chatbot.py
+++ b/chatbot.py
@@ -5,10 +5,12 @@ from pprint import pprint
 
 chromadb = ChromaDB()
 arango = ArangoDB()
-llm = LLM(temperature=0.1)
+chatbot = LLM(temperature=0.1)
 
 while True:
     user_input = "What problems are there in battery production?" # input("Enter a prompt: ")
+
+
     chunks = chromadb.db.get_collection('sci_articles').query(query_texts=user_input, n_results=7)
     combined_chunks = [
         {"document": doc, "metadata": meta}
@@ -42,7 +44,8 @@ while True:
     """
 
     prompt = f'''{user_input}
-Below are snippets from different articles with title and date of publication. ONLY use the information below to answer the question. Do not use any other information.
+Below are snippets from different articles with title and date of publication.
+ONLY use the information below to answer the question. Do not use any other information.
 """
 {chunks_string}
 
@@ -52,6 +55,6 @@ Below are snippets from different articles with title and date of publication. O
 '''
     print(prompt)
     exit()
-    response = llm.generate(prompt)
+    response = chatbot.generate(prompt)
     print(response)
     print()
\ No newline at end of file
diff --git a/dl_article_libgen.py b/dl_article_libgen.py
deleted file mode 100644
index f71f861..0000000
--- a/dl_article_libgen.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import pyperclip
-
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-        for ref in references:
-            print(ref)
-            # Copy ref to clipboard
-            found = input("Found DOI? (y/n): ")
-            f2.write(f"{ref.strip()}: {found}\n")
-
-
-
diff --git a/get_article_info.py b/get_article_info.py
index c68d7ed..421e099 100644
--- a/get_article_info.py
+++ b/get_article_info.py
@@ -1,10 +1,11 @@
-import pyperclip
 from pprint import pprint
 import requests
 import crossref_commons.retrieval
 from time import sleep
 from bs4 import BeautifulSoup
-import dl_elsy
+from _arango import ArangoDB
+
+arango = ArangoDB()
 
 def download_file(doi, url):
     try:
@@ -34,10 +35,10 @@ def download_file(doi, url):
     except requests.exceptions.RequestException as e:
         print(f"Failed to download file for DOI: {doi}. Error: {e}")
 
-def get_article_info(doi):
+def info(doi):
+    arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
     url = f'https://doaj.org/api/search/articles/{doi}'
     response = requests.get(url)
-
     if response.status_code == 200:
         data = response.json()
         for result in data.get('results', []):
@@ -51,58 +52,49 @@ def get_article_info(doi):
                 with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                     f.write(pdf.content)
                 sleep(1)
-                epub = requests.get(link['url'] + '/epub')
-                with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
-                    f.write(epub.content)
-                sleep(1)
-                print(f'Downloaded PDF and EPUB for {doi}')
-            elif 'sciencedirect.com' in link['url']:
-                return dl_elsy.get_doc(doi)
-                sleep(1)
-            else:
-
+                print(f'Downloaded PDF for {doi}')
+            else:
                 print(link['url'])
-                input()
+                user_input = input()
+                if user_input == '':
+                    arango.db.collection('sci_articles_links_downloaded').insert({
+                        '_key': arango.fix_key(doi),
+                        'doi': doi,
+                        'url': link['url']
+                    })
         return doi
     else:
         print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")
 
 
-# Read DOIs from file
-with open('review_references.csv', 'r') as f:
-    with open('review_references.txt', 'w') as f2:
-        references = f.readlines()
-# Process each DOI
-with open('review_references.txt') as f2:
-    ref_done = f2.readlines()
+if __name__ == '__main__':
+    # Read DOIs from file
+
+    with open('review_references.csv', 'r') as f:
+        with open('review_references.txt', 'w') as f2:
+            references = f.readlines()
+    # Process each DOI
+    with open('review_references.txt') as f2:
+        ref_done = f2.readlines()
 
-for ref in references:
-    doi = ref.strip()
-    print('###', ref.upper())
-    try:
-        cr = crossref_commons.retrieval.get_publication_as_json(doi)
-    except ValueError:
-        print(f"Error fetching metadata for DOI: {doi}")
-        continue
-    if 'sciencedirect.com' not in str(cr):
-        continue
-    if doi not in ref_done:
-        sleep(1)
-        r = dl_elsy.get_doc(doi)
-        if r:
-            with open('review_references.txt', 'a+') as f2:
-                f2.write(f'{r}\n')
+    for ref in references:
+        doi = ref.strip()
+        print('###', ref.upper())
+        try:
+            cr = crossref_commons.retrieval.get_publication_as_json(doi)
+        except ValueError:
+            print(f"Error fetching metadata for DOI: {doi}")
+            continue
 
-exit()
-for ref in references:
-    doi = ref.strip()
-    with open('review_references.txt', 'a') as f2:
+    for ref in references:
+        doi = ref.strip()
+        with open('review_references.txt', 'a') as f2:
 
-        r = get_article_info(doi)
-        if r:
-            f2.write(r)
+            r = info(doi)
+            if r:
+                f2.write(r)
\ No newline at end of file
diff --git a/ingest_pdfs.py b/ingest_pdfs.py
index 9d0f5b2..4f97523 100644
--- a/ingest_pdfs.py
+++ b/ingest_pdfs.py
@@ -9,6 +9,14 @@ from semantic_text_splitter import MarkdownSplitter
 from _arango import ArangoDB
 from _chromadb import ChromaDB
 
+arango = ArangoDB()
+chromadb = ChromaDB()
+
+# Initialize the chroma database
+chroma_col = chromadb.db.get_collection("sci_articles")
+max_characters = 2200
+ts = MarkdownSplitter(max_characters)
+path_folder = "sci_articles"
 
 def get_crossref(doi):
     try:
@@ -57,48 +65,44 @@ def get_crossref(doi):
         return None
 
 
-arango = ArangoDB()
-#arango.db.collection("sci_articles").truncate() #!
-
-# Initialize the chroma database
-chromadb = ChromaDB()
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-#chromadb.db.delete_collection("sci_articles") #!
-chroma_col = chromadb.db.get_or_create_collection("sci_articles")
-max_characters = 2200
-ts = MarkdownSplitter(max_characters)
-
 def extract_doi(text):
     # Define the regex pattern for DOI
-    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Z0-9]+"
+    doi_pattern = r"10\.\d{4,9}/[-._;()/:A-Za-z0-9]+"
 
     # Find the first doi in the text, if there is any
     doi = re.search(doi_pattern, text)
     if doi:
         # Return the first doi found
-        return doi.group()
+        doi = doi.group()
+        doi = doi.strip('.').replace('.pdf', '')
+        return doi
     else:
         return None
 
 
 def process_pdf(pdf):
-
-    pdf_path = os.path.join("sci_articles", pdf)
+    if '/' not in pdf:
+        pdf_path = os.path.join("sci_articles", pdf)
+    else:
+        pdf_path = pdf
 
     if extract_doi(pdf):
        doi = extract_doi(pdf)
     else:
-        text = pymupdf.get_text(pdf_path)
+        text = '\n'.join(pymupdf.get_text(pdf_path))
         doi = extract_doi(text)
     if not doi:
         print(f"\nCould not find DOI for {pdf}\n")
         return
-    crossref_info = get_crossref(doi)
 
     if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
         print(f"Article {doi} already in database")
         return
-    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
+
+    # Get metadata from Crossref
+    crossref_info = get_crossref(doi)
 
+    # Extract text from PDF
+    md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True, show_progress=False)
     md_text = ""
     for page in md_pages:
         md_text += f"{page['text'].strip()}\n@{page['metadata']['page']}@\n"
@@ -143,9 +147,9 @@ def process_pdf(pdf):
         ids.append(id)
         metadatas.append(
             {
-                "_key": pdf.strip(".pdf"),
+                "_key": arango.fix_key(doi),
                 "doi": doi,
-                "file": pdf_path,
+                "file": f"sci_articles/{doi}.pdf",
                 "chunk_nr": i,
                 "pages": ",".join([str(i) for i in page_numbers]),
             }
@@ -157,7 +161,7 @@ def process_pdf(pdf):
     arango_document = {
         "_key": arango.fix_key(doi),
         "doi": doi,
-        "file": pdf_path,
+        "file": f"sci_articles/{doi}.pdf",
         "chunks": arango_chunks,
         "text": md_text,
         "metadata": crossref_info,
@@ -166,6 +170,7 @@ def process_pdf(pdf):
         arango_document, overwrite=True, overwrite_mode="update"
     )
     print(f"Inserted article {doi} into database")
+    return doi
 
 
 def add_pdfs(path_folder):
@@ -175,5 +180,6 @@ def add_pdfs(path_folder):
 
 
 if __name__ == "__main__":
-    path_folder = "sci_articles"
+
+
     add_pdfs(path_folder)
diff --git a/scrape_html.py b/scrape_html.py
new file mode 100644
index 0000000..478866b
--- /dev/null
+++ b/scrape_html.py
@@ -0,0 +1,62 @@
+import os
+from time import sleep
+from bs4 import BeautifulSoup
+import requests
+import ingest_pdfs
+import get_article_info
+from _arango import ArangoDB
+
+arango = ArangoDB()
+
+bas_url = 'https://www.sciencedirect.com'
+file = 'copied_html.html'
+with open(file, 'r') as f:
+    html = f.read()
+
+soup = BeautifulSoup(html, 'html.parser')
+
+links = soup.find_all('a')
+
+
+
+for link in links:
+    doi = ingest_pdfs.extract_doi(str(link))
+    if doi:
+        arango.db.collection('dois_checked').insert({'_key': arango.fix_key(doi), 'doi': doi}, overwrite=True)
+    else:
+        continue
+    print(f"DOI: {doi}")
+    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
+        print(f"Article {doi} already exists in database")
+        continue
+    elif arango.db.collection("sci_articles_links_downloaded").get(arango.fix_key(doi)):
+        print(f"Article {doi} already downloaded")
+        continue
+    get_article_info.info(doi)
+
+
+
+# # Find all tags
+# links = soup.find_all('a', class_='anchor pdf link anchor-primary anchor-icon-left anchor-with-icon')
+
+# # Extract href for links with text "View PDF"
+# pdf_links = [link['href'] for link in links if link.find('span', class_='anchor-text').text == 'View PDF']
+
+# # Print the extracted links
+# for pdf_link in pdf_links:
+#     sleep(1)
+#     # Get the full URL
+#     pdf_url = f"{bas_url}{pdf_link}"
+#     print(f"Downloading {pdf_url}")
+#     content = requests.get(pdf_url).content
+#     temp_filename = 'temp_articles/temp.pdf'
+#     with open(temp_filename, 'wb') as f:
+#         f.write(content)
+#     doi = ingest_pdfs.process_pdf(temp_filename)
+#     # Rename the file with the DOI and move to sci_articles
+#     if doi:
+#         new_filename = f"sci_articles/{doi}.pdf"
+#         os.rename(temp_filename, new_filename)
+#         print(f"Downloaded {new_filename}")
+#     else:
+#         os.remove(temp_filename)
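
Note (not part of the patch): after this change the intended flow appears to be that scrape_html.py collects DOIs from a saved ScienceDirect page, get_article_info.info() records them and fetches open-access PDFs, and ingest_pdfs chunks PDFs under sci_articles/ into ArangoDB and ChromaDB for chatbot.py to query. The sketch below chains only functions defined in the diff; the helper name fetch_and_ingest and the assumption that downloaded PDFs are moved into sci_articles/ before ingestion are mine.

import ingest_pdfs
import get_article_info
from _arango import ArangoDB

arango = ArangoDB()

def fetch_and_ingest(doi):
    # Hypothetical driver, not part of the patch.
    # Skip DOIs that are already indexed (same check scrape_html.py performs).
    if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
        print(f"Article {doi} already in database")
        return
    # info() records the DOI in 'dois_checked' and downloads a PDF when DOAJ
    # exposes a fulltext link; other links are printed for manual handling.
    get_article_info.info(doi)
    # Assuming downloaded PDFs have been moved into sci_articles/, add_pdfs()
    # chunks each one and stores it in ArangoDB and ChromaDB.
    ingest_pdfs.add_pdfs("sci_articles")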