From 911b8c33b046edde3ce4aad82ef200b31cbdf509 Mon Sep 17 00:00:00 2001
From: lasseedfast <>
Date: Fri, 11 Oct 2024 08:46:03 +0200
Subject: [PATCH] Add article ingestion, retrieval chatbot and speech analysis scripts

Add scripts for building a scientific-article knowledge base and for
analysing parliamentary speeches:

- create_chroma.py / _chromadb.py: convert PDFs to Markdown with
  pymupdf4llm, split them into chunks and store them in a persistent
  ChromaDB collection, with Crossref metadata kept in ArangoDB.
- _llm.py / chatbot.py: a small Ollama client wrapper and an interactive
  chatbot that answers questions from retrieved article chunks.
- dl_article_libgen.py, dl_elsy.py and get_article_info.py: helpers for
  downloading review references. References are read from
  'review_references.csv', checked for a DOI, and the reference together
  with the result is written to 'review_references.txt'.
- clean_csv.py, explore_speakers.py, group_parties.py and
  group_speakers_streamlit.py: clean the speeches CSV and group or
  visualise speakers by the arguments they use.
- dbx_test.py: a small script that lists files in a Dropbox folder.
- _arango.py gains a fix_key() helper; arguments.py is renamed to
  all_arguments.py.
---
 _arango.py                       |   6 +-
 _chromadb.py                     |  14 +++
 _llm.py                          |  26 ++++++
 arguments.py => all_arguments.py |   0
 analyze_speeches.py              |   2 +-
 chatbot.py                       |  23 +++
 clean_csv.py                     |  15 ++++
 create_chroma.py                 | 149 +++++++++++++++++++++++++++++++
 dbx_test.py                      |  19 ++++
 dl_article_libgen.py             |  13 +++
 dl_elsy.py                       |  26 ++++++
 explore_speakers.py              |  55 ++++++++++++
 get_article_info.py              | 108 ++++++++++++++++++++++
 group_parties.py                 |  47 ++++++++++
 group_speakers_streamlit.py      |  80 +++++++++++++++++
 15 files changed, 581 insertions(+), 2 deletions(-)
 create mode 100644 _chromadb.py
 create mode 100644 _llm.py
 rename arguments.py => all_arguments.py (100%)
 create mode 100644 chatbot.py
 create mode 100644 clean_csv.py
 create mode 100644 create_chroma.py
 create mode 100644 dbx_test.py
 create mode 100644 dl_article_libgen.py
 create mode 100644 dl_elsy.py
 create mode 100644 explore_speakers.py
 create mode 100644 get_article_info.py
 create mode 100644 group_parties.py
 create mode 100644 group_speakers_streamlit.py

diff --git a/_arango.py b/_arango.py
index 4a643a3..113103a 100644
--- a/_arango.py
+++ b/_arango.py
@@ -1,4 +1,4 @@
-
+import re
 from arango import ArangoClient
 from dotenv import load_dotenv
 import os
@@ -53,3 +53,7 @@ class ArangoDB:
         if '/' in document_id:
             document_id = document_id.split('/')[-1]
         return self.db.collection('ev_speeches').get(document_id)
+
+    def fix_key(self, _key):
+
+        return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)
diff --git a/_chromadb.py b/_chromadb.py
new file mode 100644
index 0000000..81f0979
--- /dev/null
+++ b/_chromadb.py
@@ -0,0 +1,14 @@
+import chromadb
+import os
+import pymupdf4llm
+from semantic_text_splitter import MarkdownSplitter
+from _arango import ArangoDB
+from pprint import pprint
+
+class ChromaDB:
+    def __init__(self):
+        self.db = chromadb.PersistentClient("chroma_db")
+        max_characters = 2200
+        self.ts = MarkdownSplitter(max_characters)
+        self.sci_articles = self.db.get_or_create_collection("sci_articles")
+
diff --git a/_llm.py b/_llm.py
new file mode 100644
index 0000000..68194f8
--- /dev/null
+++ b/_llm.py
@@ -0,0 +1,26 @@
+from ollama import Client
+import os
+import env_manager
+env_manager.set_env()
+
+class LLM:
+    def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None:
+        self.llm_model = os.getenv("LLM_MODEL")
+        self.system_message = system_message
+        self.options = {"temperature": temperature, "num_ctx": num_ctx}
+        self.messages = [{'role': 'system', 'content': self.system_message}]
+        self.chat = chat
+        self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}')
+
+    def generate(self, prompt: str) -> str:
+        self.messages.append({"role": "user", "content": prompt})
+
+        result = self.ollama.chat(model=self.llm_model, messages=self.messages, options=self.options)
+
+        answer = result['message']['content']
+        self.messages.append({"role": "assistant", "content": answer})
+        if not self.chat:
+            self.messages = [{'role': 'system', 'content': self.system_message}]
+
+        return answer
+
diff --git a/arguments.py b/all_arguments.py
similarity index 100%
rename from arguments.py
rename to all_arguments.py
diff --git a/analyze_speeches.py b/analyze_speeches.py
index cb091ed..1667ff3 100644
--- a/analyze_speeches.py
+++ b/analyze_speeches.py
@@ -2,7 +2,7 @@ from _llm import LLM
 from collections import Counter
 from dotenv import load_dotenv
 from _arango import ArangoDB
-from arguments import arguments as all_arguments
+from all_arguments import arguments as all_arguments
 from colorprinter.print_color import *
 import matplotlib.pyplot as plt
 from sklearn.cluster import KMeans
diff --git a/chatbot.py b/chatbot.py
new file mode 100644
index 0000000..c75b942
--- /dev/null
+++ b/chatbot.py
@@ -0,0 +1,23 @@
+from _llm import LLM
+from _chromadb import ChromaDB
+
+chromadb = ChromaDB()
+llm = LLM(temperature=0.1)
+
+while True:
+    user_input = input("Enter a prompt: ")
+    chunks = chromadb.sci_articles.query(query_texts=user_input)
+    chunks_string = "\n".join(chunks['documents'][0])  # query() returns plain strings, not dicts
+    prompt = f'''{user_input}
+    Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
+
+    """
+    {chunks_string}
+    """
+
+    {user_input}
+
+    '''
+    response = llm.generate(prompt)
+    print(response)
+    print()
\ No newline at end of file
diff --git a/clean_csv.py b/clean_csv.py
new file mode 100644
index 0000000..9ae68e9
--- /dev/null
+++ b/clean_csv.py
@@ -0,0 +1,15 @@
+import csv
+
+# Preprocess the CSV file to ensure consistent field counts
+input_file = 'speeches.csv'
+output_file = 'cleaned_speeches.csv'
+
+with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
+    reader = csv.reader(infile, delimiter=';', quotechar='"')
+    writer = csv.writer(outfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+
+    for row in reader:
+        if len(row) == 22:  # Ensure the row has the correct number of fields
+            writer.writerow(row)
+
+print("CSV file has been cleaned and saved as 'cleaned_speeches.csv'")
diff --git a/create_chroma.py b/create_chroma.py
new file mode 100644
index 0000000..eca2cc6
--- /dev/null
+++ b/create_chroma.py
@@ -0,0 +1,149 @@
+import re
+import chromadb
+import os
+import pymupdf4llm
+from semantic_text_splitter import MarkdownSplitter
+from _arango import ArangoDB
+from pprint import pprint
+import crossref_commons.retrieval as crossref
+import ebooklib
+from ebooklib import epub
+import nltk
+from bs4 import BeautifulSoup
+
+# from epub_conversion.utils import open_book, convert_epub_to_lines
+
+
+def get_crossref(doi):
+    try:
+        work = crossref.get_publication_as_json(doi)
+
+        # Determine the best publication date
+        if "published-print" in work:
+            publication_date = work["published-print"]["date-parts"][0]
+        elif "published-online" in work:
+            publication_date = work["published-online"]["date-parts"][0]
+        elif "issued" in work:
+            publication_date = work["issued"]["date-parts"][0]
+        else:
+            publication_date = [None]
+
+        metadata = {
+            "doi": work.get("DOI", None),
+            "title": work.get("title", [None])[
+                0
+            ],  # Extract the first title if available
+            "authors": [
+                f"{author['given']} {author['family']}"
+                for author in work.get("author", [])
+            ],
+            "abstract": work.get("abstract", None),
+            "journal": work.get("container-title", [None])[
+                0
+            ],  # Extract the first journal title if available
+            "volume": work.get("volume", None),
+            "issue": work.get("issue", None),
+            "pages": work.get("page", None),
+            "published_date": "-".join(
+                map(str, publication_date)
+            ),  # Join date parts with hyphens
+            "url_doi": work.get("URL", None),
+            "link": (
+                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
+            ),
+            "language": work.get("language", None),
+        }
+        return metadata
+    except Exception as e:
+        print(f"Error retrieving metadata for DOI {doi}: {e}")
+        return None
+
+
+arango = ArangoDB()
+arango.db.collection("sci_articles").truncate() #!
+
+# Initialize the chroma database
+db = chromadb.PersistentClient("chroma_db")
+col = db.get_or_create_collection("articles")
+db.delete_collection("articles") #!
+col = db.get_or_create_collection("articles")
+max_characters = 2200
+ts = MarkdownSplitter(max_characters)
+
+
+def add_pdfs(path_folder):
+    pdf_in_folder = []
+    for file in os.listdir(path_folder):
+        if file.endswith(".pdf"):
+            pdf_in_folder.append(file)
+
+    for pdf in pdf_in_folder:
+        doi = pdf.removesuffix(".pdf").replace("_", "/")  # strip(".pdf") would also eat DOI characters at the ends
+        crossref_info = get_crossref(doi)
+
+        if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
+            print(f"Article {doi} already in database")
+            continue
+        pdf_path = os.path.join("sci_articles", pdf)
+        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
+
+        md_text = ""
+        for page in md_pages:
+            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"
+
+        ids = []
+        documents = []
+        metadatas = []
+        better_chunks = []
+        chunks = ts.chunks(md_text)
+
+        # Merge chunks that are too short
+        for chunk in chunks:
+            if all(
+                [
+                    len(chunk) < int(max_characters / 3),  # TODO Are those values good?
+                    len(chunks[-1]) < int(max_characters * 1.5),
+                    len(better_chunks) > 0,
+                ]
+            ):
+                better_chunks[-1] += chunk
+            else:
+                better_chunks.append(chunk)  # append the single chunk, not the whole list
+        arango_chunks = []
+        last_page = 1
+        for i, chunk in enumerate(better_chunks):  # iterate the merged chunks built above
+            page_numbers = re.findall(r"@(\d+)@", chunk)
+            if page_numbers == []:
+                page_numbers = [last_page]
+            else:
+                last_page = page_numbers[-1]
+            id = arango.fix_key(doi) + f"_{i}"
+            ids.append(id)
+            metadatas.append(
+                {
+                    "doi": pdf.removesuffix(".pdf"),
+                    "file": pdf_path,
+                    "chunk_nr": i,
+                    "pages": ",".join([str(i) for i in page_numbers]),
+                }
+            )
+            chunk = re.sub(r"@(\d+)@", "", chunk)
+            documents.append(chunk)
+            arango_chunks.append({"text": chunk, "pages": page_numbers})
+        col.add(ids=ids, documents=documents, metadatas=metadatas)
+        arango_document = {
+            "_key": arango.fix_key(doi),
+            "doi": doi,
+            "file": pdf_path,
+            "chunks": arango_chunks,
+            "text": md_text,
+            "metadata": crossref_info,
+        }
+        arango.db.collection("sci_articles").insert(
+            arango_document, overwrite=True, overwrite_mode="update"
+        )
+        print(f"Inserted article {doi} into database")
+
+
+path_folder = "sci_articles"
+add_pdfs(path_folder)
diff --git a/dbx_test.py b/dbx_test.py
new file mode 100644
index 0000000..d71b332
--- /dev/null
+++ b/dbx_test.py
@@ -0,0 +1,19 @@
+import dropbox
+
+# Replace with your access token
+ACCESS_TOKEN = 'sl.B-hTaHGCpioPzyC_BVCulhgIP3xTfpTcEgaPwkpzu00j3rgA7Q-9Durd2S1TnA5yqiS_ucn4YcDdyG_VFxropLZiyVPhxd4MiIHpFItugn9DCoMjtiy3Y8lJ6iD2I1A7DAhjlTavVUnxNTc'
+
+# Initialize a Dropbox client
+dbx = dropbox.Dropbox(ACCESS_TOKEN)
+
+# Define the folder path
+folder_path = '/Filinlämningar/Electric Cars'
+
+# List all files in the folder
+try:
+    result = dbx.files_list_folder(folder_path)
+    print(f"Files in '{folder_path}':")
+    for entry in result.entries:
+        print(entry.name)
+except dropbox.exceptions.ApiError as err:
+    print(f"Failed to list folder contents: 
{err}") \ No newline at end of file diff --git a/dl_article_libgen.py b/dl_article_libgen.py new file mode 100644 index 0000000..f71f861 --- /dev/null +++ b/dl_article_libgen.py @@ -0,0 +1,13 @@ +import pyperclip + +with open('review_references.csv', 'r') as f: + with open('review_references.txt', 'w') as f2: + references = f.readlines() + for ref in references: + print(ref) + # Copy ref to clipboard + found = input("Found DOI? (y/n): ") + f2.write(f"{ref.strip()}: {found}\n") + + + diff --git a/dl_elsy.py b/dl_elsy.py new file mode 100644 index 0000000..0b6d6e7 --- /dev/null +++ b/dl_elsy.py @@ -0,0 +1,26 @@ +"""An example program that uses the elsapy module""" + +from elsapy.elsclient import ElsClient +from elsapy.elsprofile import ElsAuthor, ElsAffil +from elsapy.elsdoc import FullDoc, AbsDoc +from elsapy.elssearch import ElsSearch +import json + +## Load configuration +con_file = open("config.json") +config = json.load(con_file) +con_file.close() + +## Initialize client +client = ElsClient(config['apikey']) + +def get_doc(doi): + ## ScienceDirect (full-text) document example using DOI + doi_doc = FullDoc(doi = doi) + if doi_doc.read(client): + print ("doi_doc.title: ", doi_doc.title) + doi_doc.write() + return doi + else: + return None + diff --git a/explore_speakers.py b/explore_speakers.py new file mode 100644 index 0000000..566912c --- /dev/null +++ b/explore_speakers.py @@ -0,0 +1,55 @@ +import pandas as pd +from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information + +# Step 1: Read the CSV file +df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';') + +print(df.head()) + +# Step 2: Extract relevant columns +# Assuming the arguments start from the 5th column onwards +arguments = df.columns[5:] +df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)] + +# Step 3: Create a binary matrix for arguments +# Convert the argument columns to integers +df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int) + +# Step 4: Calculate sentiment scores for each politician +def calculate_sentiment_score(row): + score = 0 + for arg in arguments: + if row[arg] > 0: + sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral') + if sentiment == 'positive': + score += 1 + elif sentiment == 'negative': + score -= 1 + return score + +df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1) + +# Step 5: Identify the top 3 most positive and negative politicians +top_3_positive = df_arguments.nlargest(3, 'sentiment_score') +top_3_negative = df_arguments.nsmallest(3, 'sentiment_score') + +# Step 6: Extract arguments used by these politicians +def extract_arguments(df): + result = {} + for _, row in df.iterrows(): + name = row['name'] + used_arguments = [arg for arg in arguments if row[arg] > 0] + result[name] = used_arguments + return result + +positive_arguments = extract_arguments(top_3_positive) +negative_arguments = extract_arguments(top_3_negative) + +# Print the results +print("Top 3 Positive Politicians and their Arguments:") +for name, args in positive_arguments.items(): + print(f"{name}: {args}") + +print("\nTop 3 Negative Politicians and their Arguments:") +for name, args in negative_arguments.items(): + print(f"{name}: {args}") \ No newline at end of file diff --git a/get_article_info.py b/get_article_info.py new file mode 100644 index 0000000..c68d7ed --- /dev/null +++ b/get_article_info.py @@ -0,0 +1,108 @@ +import pyperclip +from 
pprint import pprint +import requests +import crossref_commons.retrieval +from time import sleep +from bs4 import BeautifulSoup +import dl_elsy + +def download_file(doi, url): + try: + response = requests.get(url) + response.raise_for_status() # Check if the request was successful + content_type = response.headers['Content-Type'] + + if content_type == 'application/pdf': + file_extension = 'pdf' + elif content_type.startswith('text/'): + file_extension = 'md' + else: + print(f"Unsupported content type: {content_type} for DOI: {doi}") + return + + file_name = f"{doi}.{file_extension}".replace('/', '_') + + if file_extension == 'md': + soup = BeautifulSoup(response.content, 'html.parser') + print(soup.text) + exit() + + with open(file_name, 'wb') as f: + f.write(response.content) + print(f"Downloaded {file_extension.upper()} for DOI: {doi}") + + except requests.exceptions.RequestException as e: + print(f"Failed to download file for DOI: {doi}. Error: {e}") + +def get_article_info(doi): + url = f'https://doaj.org/api/search/articles/{doi}' + response = requests.get(url) + + if response.status_code == 200: + data = response.json() + for result in data.get('results', []): + for link in result.get('bibjson', {}).get('link', []): + if 'mdpi.com' in link['url']: + r = requests.get(link['url']) + soup = BeautifulSoup(r.content, 'html.parser') + pdf_link_html = soup.find('a', {'class':'UD_ArticlePDF'}) + pdf_url = 'https://www.mdpi.com' + pdf_link_html['href'] + pdf = requests.get(pdf_url) + with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f: + f.write(pdf.content) + sleep(1) + epub = requests.get(link['url'] + '/epub') + with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f: + f.write(epub.content) + sleep(1) + print(f'Downloaded PDF and EPUB for {doi}') + elif 'sciencedirect.com' in link['url']: + return dl_elsy.get_doc(doi) + sleep(1) + else: + + + print(link['url']) + input() + return doi + + else: + print(f"Error fetching metadata for DOI: {doi}. 
HTTP Status Code: {response.status_code}") + +# Read DOIs from file + +with open('review_references.csv', 'r') as f: + with open('review_references.txt', 'w') as f2: + references = f.readlines() +# Process each DOI +with open('review_references.txt') as f2: + ref_done = f2.readlines() + + + +for ref in references: + doi = ref.strip() + print('###', ref.upper()) + try: + cr = crossref_commons.retrieval.get_publication_as_json(doi) + except ValueError: + print(f"Error fetching metadata for DOI: {doi}") + continue + if 'sciencedirect.com' not in str(cr): + continue + + if doi not in ref_done: + sleep(1) + r = dl_elsy.get_doc(doi) + if r: + with open('review_references.txt', 'a+') as f2: + f2.write(f'{r}\n') + +exit() +for ref in references: + doi = ref.strip() + with open('review_references.txt', 'a') as f2: + + r = get_article_info(doi) + if r: + f2.write(r) diff --git a/group_parties.py b/group_parties.py new file mode 100644 index 0000000..9ebc993 --- /dev/null +++ b/group_parties.py @@ -0,0 +1,47 @@ +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information + + +# Step 1: Read the CSV file +df = pd.read_csv('speeches.csv', delimiter=';') + +# Step 2: Extract relevant columns +# Assuming the arguments start from the 5th column onwards +arguments = df.columns[5:] +df_arguments = df[['_key', 'name'] + list(arguments)] + +# Step 3: Create a binary matrix for arguments +# Convert the argument columns to integers +df_arguments[arguments] = df_arguments[arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int) + +# Step 4: Sum the arguments for each politician +df_sum = df_arguments.groupby('name')[arguments].sum().reset_index() + +# Step 5: Plot the data +plt.figure(figsize=(12, 8)) +sns.heatmap(df_sum.set_index('name'), annot=True, cmap='coolwarm', cbar=True) +plt.title('Arguments Used by Politicians') +plt.xlabel('Arguments') +plt.ylabel('Politicians') + +# Step 6: Color the x-axis labels based on sentiment +ax = plt.gca() +x_labels = ax.get_xticklabels() +for label in x_labels: + argument = label.get_text() + sentiment = arguments_dict.get(argument, {}).get('sentiment', 'neutral') + if sentiment == 'positive': + label.set_color('green') + elif sentiment == 'negative': + label.set_color('red') + else: + label.set_color('black') + +plt.xticks(rotation=45, ha='right') +plt.tight_layout() + +# Save the plot instead of showing it +plt.savefig('arguments_used_by_politicians.png') +plt.close() \ No newline at end of file diff --git a/group_speakers_streamlit.py b/group_speakers_streamlit.py new file mode 100644 index 0000000..ea7002a --- /dev/null +++ b/group_speakers_streamlit.py @@ -0,0 +1,80 @@ +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from sklearn.decomposition import PCA +from sklearn.cluster import KMeans +from sklearn.preprocessing import StandardScaler +import streamlit as st +from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information + +# Step 1: Read the CSV file +df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';') + +# Step 2: Extract relevant columns +# Assuming the arguments start from the 5th column onwards +arguments = df.columns[5:] +df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)] + +# Step 3: Create a binary matrix for arguments +# Convert the argument columns to integers +df_arguments.loc[:, arguments] = df_arguments.loc[:, 
arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int) + +# Step 4: Calculate sentiment scores for each politician +def calculate_sentiment_score(row): + score = 0 + for arg in arguments: + if row[arg] > 0: + sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral') + if sentiment == 'positive': + score += 1 + elif sentiment == 'negative': + score -= 1 + return score + +df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1) + +# # Step 5: Standardize the data +# scaler = StandardScaler() +# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments]) + +# Step 6: Dimensionality reduction using PCA +pca = PCA(n_components=2) +pca_result = pca.fit_transform(df_arguments[arguments]) +df_arguments['pca1'] = pca_result[:, 0] +df_arguments['pca2'] = pca_result[:, 1] + +# Step 7: Examine loadings +loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments) + +# Step 8: Perform clustering +kmeans = KMeans(n_clusters=3) # Adjust the number of clusters as needed +df_arguments['cluster'] = kmeans.fit_predict(pca_result) + +# Streamlit app +st.title('Politicians Grouped by Arguments Used and Sentiment Score') + +# Step 9: Plot the data with clusters using Plotly +fig = px.scatter(df_arguments, x='pca1', y='pca2', color='cluster', title='Politicians Grouped by Arguments Used and Clusters', + labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'}, hover_data={'name': True}) +st.plotly_chart(fig) + +# Step 10: Visualize original arguments using Plotly +fig = go.Figure() + +# Add arrows for loadings +for argument in arguments: + fig.add_trace(go.Scatter(x=[0, loadings.loc[argument, 'PCA1']], y=[0, loadings.loc[argument, 'PCA2']], + mode='lines+text', text=[None, argument], textposition='top center', + line=dict(color='red', width=2), showlegend=False)) + +# Add scatter plot for politicians +fig.add_trace(go.Scatter(x=df_arguments['pca1'], y=df_arguments['pca2'], mode='markers', + marker=dict(color=df_arguments['sentiment_score'], colorscale='Viridis', size=10), + text=df_arguments['name'], hoverinfo='text')) + +fig.update_layout(title='PCA Biplot of Politicians and Arguments', + xaxis_title='PCA Component 1', + yaxis_title='PCA Component 2', + showlegend=False) + +st.plotly_chart(fig) \ No newline at end of file
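
Usage sketch (not part of the patch): a minimal example of how the new pieces are meant to compose once create_chroma.py has populated the persistent Chroma store. The question text and n_results value are made up for illustration; it assumes the repository's env_manager provides LLM_MODEL, LLM_URL and LLM_PORT as _llm.py expects, and it queries the "articles" collection that create_chroma.py writes (note that _chromadb.py reads a collection named "sci_articles", so the names may need to be aligned).

import chromadb

from _llm import LLM  # needs LLM_MODEL, LLM_URL and LLM_PORT in the environment

# Open the store written by create_chroma.py and fetch the closest chunks.
client = chromadb.PersistentClient("chroma_db")
articles = client.get_or_create_collection("articles")  # collection name used in create_chroma.py

question = "What do the articles say about battery degradation?"  # hypothetical example query
hits = articles.query(query_texts=[question], n_results=5)
context = "\n\n".join(hits["documents"][0])  # documents come back as plain strings

# Ask the Ollama-backed wrapper to answer only from the retrieved snippets.
llm = LLM(temperature=0.1)
answer = llm.generate(
    f'{question}\n\nUse ONLY the snippets below to answer.\n"""\n{context}\n"""'
)
print(answer)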