Add code to download and process review references

This commit adds code to download review references from a CSV file and process them. The references are read from a file called 'review_references.csv' and each reference is checked for a DOI. The user is prompted to confirm if a DOI is found, and the reference along with the confirmation is written to a new file called 'review_references.txt'. This code will be useful for managing review references in the future.
1 year ago · 911b8c33b0
parent 08e17d13a5
commit 911b8c33b0
15 changed files with 581 additions and 2 deletions
--- a/_arango.py
+++ b/_arango.py
@ -1,4 +1,4 @@
-
+import re
 from arango import ArangoClient
 from dotenv import load_dotenv
 import os
@ -53,3 +53,7 @@ class ArangoDB:
            if '/' in document_id:
                document_id = document_id.split('/')[-1]
            return self.db.collection('ev_speeches').get(document_id)
+
+    def fix_key(self, _key):
+        """Return *_key* with every character outside the allowed set replaced by "_".
+
+        Kept characters are ASCII letters, digits and ``_ - . @ ( ) + = ; $ ! * ' % :``.
+        NOTE(review): presumably this mirrors the characters ArangoDB accepts in a
+        document ``_key`` -- confirm against the ArangoDB documentation.
+        """
+        return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)
--- a/_chromadb.py
+++ b/_chromadb.py
@ -0,0 +1,14 @@
+import chromadb
+import os
+import pymupdf4llm
+from semantic_text_splitter import MarkdownSplitter
+from _arango import ArangoDB
+from pprint import pprint
+
+class ChromaDB:
+    """Bundle a persistent Chroma client, a Markdown splitter and one collection."""
+
+    # NOTE(review): create_chroma.py in this commit fills a collection named
+    # "articles", while this class opens "sci_articles" -- confirm which name
+    # is the intended one.
+    def __init__(self):
+        # Client opened on the "chroma_db" path.
+        self.db = chromadb.PersistentClient("chroma_db")
+        # Capacity handed to the splitter (same value create_chroma.py uses).
+        max_characters = 2200
+        self.ts = MarkdownSplitter(max_characters)
+        self.sci_articles = self.db.get_or_create_collection("sci_articles")
+
--- a/_llm.py
+++ b/_llm.py
@ -0,0 +1,26 @@
+from ollama import Client
+import os
+import env_manager
+env_manager.set_env()
+
+class LLM:
+    """Small wrapper around an Ollama chat client that keeps the message history.
+
+    The model name and the server address are read from the ``LLM_MODEL``,
+    ``LLM_URL`` and ``LLM_PORT`` environment variables.
+
+    Args:
+        system_message: Optional system prompt placed first in the history.
+        num_ctx: Passed to Ollama as the ``num_ctx`` option.
+        temperature: Passed to Ollama as the ``temperature`` option.
+        chat: When False, the history is reset after every ``generate`` call.
+    """
+
+    def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None:
+        self.llm_model = os.getenv("LLM_MODEL")
+        self.system_message = system_message
+        self.options = {"temperature": temperature, "num_ctx": num_ctx}
+        self.chat = chat
+        self.messages = self._initial_messages()
+        self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}')
+
+    def _initial_messages(self) -> list:
+        """Return a fresh history: the system message if one was given, else empty."""
+        # A system entry without content carries no information, and some
+        # ollama client versions reject messages that have no content, so it
+        # is left out instead of being sent as ``content: None``.
+        if not self.system_message:
+            return []
+        return [{'role': 'system', 'content': self.system_message}]
+
+    def generate(self, prompt: str) -> str:
+        """Send *prompt* to the model and return the text of its reply.
+
+        In chat mode the prompt and the reply are kept in ``self.messages``;
+        otherwise the history is reset to its initial state after the call.
+        """
+        self.messages.append({"role": "user", "content": prompt})
+
+        result = self.ollama.chat(model=self.llm_model, messages=self.messages, options=self.options)
+
+        answer = result['message']['content']
+        if self.chat:
+            self.messages.append({"role": "assistant", "content": answer})
+        else:
+            self.messages = self._initial_messages()
+
+        return answer
+    
--- a/all_arguments.py
+++ b/all_arguments.py
--- a/analyze_speeches.py
+++ b/analyze_speeches.py
@ -2,7 +2,7 @@ from _llm import LLM
 from collections import Counter
 from dotenv import load_dotenv
 from _arango import ArangoDB
-from arguments import arguments as all_arguments
+from all_arguments import arguments as all_arguments
 from colorprinter.print_color import *
 import matplotlib.pyplot as plt
 from sklearn.cluster import KMeans
--- a/chatbot.py
+++ b/chatbot.py
@ -0,0 +1,23 @@
+from _llm import LLM
+from _chromadb import ChromaDB
+
+chromadb = ChromaDB()
+llm = LLM(temperature=0.1)
+
+# Interactive loop: retrieve the stored snippets closest to the question and
+# ask the model to answer from those snippets only.
+while True:
+    user_input = input("Enter a prompt: ")
+    chunks = chromadb.sci_articles.query(query_texts=[user_input])
+    # query() returns one list of documents per query text, and each document
+    # is already the chunk text (a plain string), so the first list is joined
+    # directly; indexing a document with ["text"] raises TypeError.
+    chunks_string = "\n".join(chunks['documents'][0])
+    prompt = f'''{user_input}
+    Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
+
+    """
+    {chunks_string}
+    """
+    
+    {user_input}
+
+    '''
+    response = llm.generate(prompt)
+    print(response)
+    print()
--- a/clean_csv.py
+++ b/clean_csv.py
@ -0,0 +1,15 @@
+import csv
+
+# Preprocess the CSV file to ensure consistent field counts
+input_file = 'speeches.csv'
+output_file = 'cleaned_speeches.csv'
+expected_fields = 22  # A row is kept only when it has exactly this many fields
+
+kept_rows = 0
+dropped_rows = 0
+
+# newline='' on both files lets the csv module handle line endings itself, so
+# quoted fields containing line breaks are read and written unchanged.
+with open(input_file, 'r', encoding='utf-8', newline='') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
+    reader = csv.reader(infile, delimiter=';', quotechar='"')
+    writer = csv.writer(outfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+
+    for row in reader:
+        if len(row) == expected_fields:  # Ensure the row has the correct number of fields
+            writer.writerow(row)
+            kept_rows += 1
+        else:
+            dropped_rows += 1
+
+print("CSV file has been cleaned and saved as 'cleaned_speeches.csv'")
+# Report what was discarded so data loss does not go unnoticed.
+print(f"Kept {kept_rows} rows, dropped {dropped_rows} rows that did not have {expected_fields} fields")
--- a/create_chroma.py
+++ b/create_chroma.py
@ -0,0 +1,149 @@
+import re
+import chromadb
+import os
+import pymupdf4llm
+from semantic_text_splitter import MarkdownSplitter
+from _arango import ArangoDB
+from pprint import pprint
+import crossref_commons.retrieval as crossref
+import ebooklib
+from ebooklib import epub
+import nltk
+from bs4 import BeautifulSoup
+
+# from epub_conversion.utils import open_book, convert_epub_to_lines
+
+
+def get_crossref(doi):
+    """Fetch bibliographic metadata for *doi* from Crossref.
+
+    Args:
+        doi: The DOI to look up.
+
+    Returns:
+        A dict with the keys ``doi``, ``title``, ``authors``, ``abstract``,
+        ``journal``, ``volume``, ``issue``, ``pages``, ``published_date``,
+        ``url_doi``, ``link`` and ``language``; fields missing from the
+        Crossref record are None (``authors`` is an empty list). Returns None
+        when the lookup itself fails; the error is printed.
+    """
+    try:
+        work = crossref.get_publication_as_json(doi)
+
+        # Determine the best publication date: print, then online, then issued
+        publication_date = []
+        for date_field in ("published-print", "published-online", "issued"):
+            if date_field in work:
+                publication_date = work[date_field]["date-parts"][0]
+                break
+        # Skip missing parts so an absent date becomes None, not the text "None"
+        date_parts = [str(part) for part in publication_date if part is not None]
+
+        # Not every author entry has both "given" and "family"; use whatever
+        # name parts exist instead of failing the whole record on a KeyError.
+        # NOTE(review): the "name" fallback assumes Crossref uses that key for
+        # organisational authors -- confirm against the Crossref schema.
+        authors = []
+        for author in work.get("author", []):
+            name_parts = [author.get("given"), author.get("family")]
+            full_name = " ".join(part for part in name_parts if part) or author.get("name")
+            if full_name:
+                authors.append(full_name)
+
+        # "or [None]" also covers keys that are present but hold an empty list
+        titles = work.get("title") or [None]
+        journals = work.get("container-title") or [None]
+        links = work.get("link") or []
+
+        metadata = {
+            "doi": work.get("DOI", None),
+            "title": titles[0],  # Extract the first title if available
+            "authors": authors,
+            "abstract": work.get("abstract", None),
+            "journal": journals[0],  # Extract the first journal title if available
+            "volume": work.get("volume", None),
+            "issue": work.get("issue", None),
+            "pages": work.get("page", None),
+            "published_date": "-".join(date_parts) or None,  # Join date parts with hyphens
+            "url_doi": work.get("URL", None),
+            "link": links[0].get("URL") if links else None,
+            "language": work.get("language", None),
+        }
+        return metadata
+    except Exception as e:
+        # Best effort on purpose: a failed lookup must not stop the import run
+        print(f"Error retrieving metadata for DOI {doi}: {e}")
+        return None
+
+
+arango = ArangoDB()
+# WARNING: destructive -- empties the "sci_articles" collection on every run.
+# Because of this, the "already in database" check inside add_pdfs can only
+# match articles inserted earlier in the same run.
+arango.db.collection("sci_articles").truncate()  #!
+
+# Initialize the chroma database
+db = chromadb.PersistentClient("chroma_db")
+# The collection is created first so that the delete below always has
+# something to delete, then it is created again empty.
+# NOTE(review): _chromadb.ChromaDB in this commit opens "sci_articles", not
+# "articles" -- confirm which collection name is intended.
+col = db.get_or_create_collection("articles")
+db.delete_collection("articles")  #!
+col = db.get_or_create_collection("articles")
+# Capacity handed to the splitter; add_pdfs also derives its merge thresholds from it.
+max_characters = 2200
+ts = MarkdownSplitter(max_characters)
+
+
+def add_pdfs(path_folder):
+    """Index every PDF in *path_folder* into the Chroma collection and ArangoDB.
+
+    The file name without ".pdf" is taken as the DOI with "/" written as "_".
+    Each PDF is converted to Markdown, split into chunks, and stored as one
+    Chroma entry per chunk plus one ArangoDB document per article.
+
+    Args:
+        path_folder: Directory that contains the PDF files.
+    """
+    pdf_in_folder = [file for file in os.listdir(path_folder) if file.endswith(".pdf")]
+
+    for pdf in pdf_in_folder:
+        # splitext removes only the extension; str.strip(".pdf") would strip
+        # any of the characters ".", "p", "d", "f" from both ends of the name.
+        file_stem = os.path.splitext(pdf)[0]
+        # NOTE(review): this also turns underscores that belong to the DOI
+        # itself into "/" -- the file naming scheme is ambiguous here.
+        doi = file_stem.replace("_", "/")
+        key = arango.fix_key(doi)
+
+        # Check for an existing article first so no Crossref request is spent on it
+        if arango.db.collection("sci_articles").get(key):
+            print(f"Article {doi} already in database")
+            continue
+        crossref_info = get_crossref(doi)
+        pdf_path = os.path.join(path_folder, pdf)
+        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
+
+        # Append an "@<page>@" marker after each page's text so the page
+        # numbers can be recovered from the chunks after splitting.
+        md_text = ""
+        for page in md_pages:
+            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"
+
+        # Merge chunks that are too short into the chunk before them
+        better_chunks = []
+        for chunk in ts.chunks(md_text):
+            if (
+                better_chunks
+                and len(chunk) < int(max_characters / 3)  # TODO Are those values good?
+                and len(better_chunks[-1]) < int(max_characters * 1.5)
+            ):
+                better_chunks[-1] += chunk
+            else:
+                better_chunks.append(chunk)
+
+        ids = []
+        documents = []
+        metadatas = []
+        arango_chunks = []
+        # NOTE(review): a chunk without a marker inherits the last marker seen,
+        # but markers are written at the END of a page, so such a chunk may
+        # really belong to the following page -- confirm the intended numbering.
+        last_page = 1
+        for i, chunk in enumerate(better_chunks):
+            # Page numbers are kept as ints throughout (the fallback is an int too)
+            page_numbers = [int(number) for number in re.findall(r"@(\d+)@", chunk)]
+            if page_numbers:
+                last_page = page_numbers[-1]
+            else:
+                page_numbers = [last_page]
+            ids.append(f"{key}_{i}")
+            metadatas.append(
+                {
+                    "doi": file_stem,
+                    "file": pdf_path,
+                    "chunk_nr": i,
+                    "pages": ",".join(str(number) for number in page_numbers),
+                }
+            )
+            # Remove the page markers before the text is stored
+            chunk = re.sub(r"@(\d+)@", "", chunk)
+            documents.append(chunk)
+            arango_chunks.append({"text": chunk, "pages": page_numbers})
+        # A PDF that produced no text yields no chunks; skip the empty add
+        if ids:
+            col.add(ids=ids, documents=documents, metadatas=metadatas)
+        arango_document = {
+            "_key": key,
+            "doi": doi,
+            "file": pdf_path,
+            "chunks": arango_chunks,
+            "text": md_text,
+            "metadata": crossref_info,
+        }
+        arango.db.collection("sci_articles").insert(
+            arango_document, overwrite=True, overwrite_mode="update"
+        )
+        print(f"Inserted article {doi} into database")
+
+
+path_folder = "sci_articles"
+add_pdfs(path_folder)
--- a/dbx_test.py
+++ b/dbx_test.py
@ -0,0 +1,19 @@
+import os
+
+import dropbox
+
+# SECURITY: the access token must never be committed. It is read from the
+# DROPBOX_ACCESS_TOKEN environment variable instead. The token that was
+# previously hard-coded here is exposed in the history and must be revoked.
+ACCESS_TOKEN = os.environ.get('DROPBOX_ACCESS_TOKEN')
+if not ACCESS_TOKEN:
+    raise SystemExit("Set the DROPBOX_ACCESS_TOKEN environment variable to a Dropbox access token.")
+
+# Initialize a Dropbox client
+dbx = dropbox.Dropbox(ACCESS_TOKEN)
+
+# Define the folder path
+folder_path = '/Filinlämningar/Electric Cars'
+
+# List all files in the folder
+try:
+    result = dbx.files_list_folder(folder_path)
+    print(f"Files in {folder_path}:")
+    for entry in result.entries:
+        print(entry.name)
+except dropbox.exceptions.ApiError as err:
+    print(f"Failed to list folder contents: {err}")
--- a/dl_article_libgen.py
+++ b/dl_article_libgen.py
@ -0,0 +1,13 @@
+import pyperclip
+
+# Walk through the references one by one; for each, the user answers whether a
+# DOI was found and the answer is saved next to the reference.
+with open('review_references.csv', 'r') as f:
+    with open('review_references.txt', 'w') as f2:
+        references = f.readlines()
+        for ref in references:
+            print(ref)
+            # Copy ref to clipboard so it can be pasted into a search field
+            pyperclip.copy(ref.strip())
+            found = input("Found DOI? (y/n): ")
+            f2.write(f"{ref.strip()}: {found}\n")
+
+
+
--- a/dl_elsy.py
+++ b/dl_elsy.py
@ -0,0 +1,26 @@
+"""An example program that uses the elsapy module"""
+
+from elsapy.elsclient import ElsClient
+from elsapy.elsprofile import ElsAuthor, ElsAffil
+from elsapy.elsdoc import FullDoc, AbsDoc
+from elsapy.elssearch import ElsSearch
+import json
+    
+## Load configuration
+# The context manager closes the file even when json.load raises.
+with open("config.json") as con_file:
+    config = json.load(con_file)
+
+## Initialize client
+client = ElsClient(config['apikey'])
+
+def get_doc(doi):
+    """Read the full-text document for *doi* and write it out via elsapy.
+
+    Returns *doi* when the document could be read, otherwise None.
+    """
+    ## ScienceDirect (full-text) document example using DOI
+    doi_doc = FullDoc(doi = doi)
+    # Guard clause: nothing to print or write when the read did not succeed
+    if not doi_doc.read(client):
+        return None
+    print ("doi_doc.title: ", doi_doc.title)
+    doi_doc.write()
+    return doi
+
--- a/explore_speakers.py
+++ b/explore_speakers.py
@ -0,0 +1,55 @@
+import pandas as pd
+from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information
+
+# Step 1: Read the CSV file
+df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')
+
+# Show the first rows as a quick check of what was loaded
+print(df.head())
+
+# Step 2: Extract relevant columns
+# Every column from positional index 5 onwards (i.e. starting with the sixth
+# column) is treated as an argument column -- TODO confirm against the CSV layout
+arguments = df.columns[5:]
+df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]
+
+# Step 3: Create a numeric matrix for arguments
+# Convert the argument columns to integers; values that cannot be parsed as
+# numbers become 0
+df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
+
+# Step 4: Calculate sentiment scores for each politician
+def calculate_sentiment_score(row):
+    """Return the net sentiment of one row.
+
+    Every argument column with a value above zero counts +1 when the argument
+    is marked 'positive' in ``arguments_dict`` and -1 when it is marked
+    'negative'; arguments without a sentiment entry count as neutral (0).
+    """
+    used = [column for column in arguments if row[column] > 0]
+    labels = [arguments_dict.get(column, {}).get('sentiment', 'neutral') for column in used]
+    positives = sum(label == 'positive' for label in labels)
+    negatives = sum(label == 'negative' for label in labels)
+    return positives - negatives
+
+# axis=1 calls the scoring function once per row
+df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)
+
+# Step 5: Identify the top 3 most positive and negative politicians
+# NOTE(review): the ranking is per row of the CSV; if one politician appears in
+# several rows, each row is ranked separately -- confirm this is intended.
+top_3_positive = df_arguments.nlargest(3, 'sentiment_score')
+top_3_negative = df_arguments.nsmallest(3, 'sentiment_score')
+
+# Step 6: Extract arguments used by these politicians
+def extract_arguments(df):
+    """Map each row's ``name`` to the argument columns whose value is above zero.
+
+    When several rows share a name, the last such row determines the entry.
+    """
+    return {
+        record['name']: [column for column in arguments if record[column] > 0]
+        for _, record in df.iterrows()
+    }
+
+# Collect, per name, the arguments used in the three highest- and three
+# lowest-scoring rows
+positive_arguments = extract_arguments(top_3_positive)
+negative_arguments = extract_arguments(top_3_negative)
+
+# Print the results
+print("Top 3 Positive Politicians and their Arguments:")
+for name, args in positive_arguments.items():
+    print(f"{name}: {args}")
+
+print("\nTop 3 Negative Politicians and their Arguments:")
+for name, args in negative_arguments.items():
+    print(f"{name}: {args}")
--- a/get_article_info.py
+++ b/get_article_info.py
@ -0,0 +1,108 @@
+import pyperclip
+from pprint import pprint
+import requests
+import crossref_commons.retrieval
+from time import sleep
+from bs4 import BeautifulSoup
+import dl_elsy
+
+def download_file(doi, url):
+    """Download *url* and save it under a file name derived from *doi*.
+
+    PDF responses are written to ``<doi>.pdf`` with "/" replaced by "_".
+    Responses with a text content type are parsed as HTML, printed, and the
+    program then exits. Other content types are reported and skipped.
+
+    Args:
+        doi: DOI of the article; used for the file name and in messages.
+        url: Address to download from.
+    """
+    try:
+        # A timeout keeps one unresponsive server from blocking the whole run
+        response = requests.get(url, timeout=30)
+        response.raise_for_status()  # Check if the request was successful
+        # The header can be absent, and can carry parameters such as
+        # "; charset=utf-8", so only the media type itself is compared.
+        content_type = response.headers.get('Content-Type', '')
+        media_type = content_type.split(';')[0].strip().lower()
+
+        if media_type == 'application/pdf':
+            file_extension = 'pdf'
+        elif media_type.startswith('text/'):
+            file_extension = 'md'
+        else:
+            print(f"Unsupported content type: {content_type} for DOI: {doi}")
+            return
+
+        file_name = f"{doi}.{file_extension}".replace('/', '_')
+
+        if file_extension == 'md':
+            # NOTE(review): this branch prints the page text and terminates the
+            # whole program without saving anything -- looks like a debugging
+            # leftover; confirm before relying on it.
+            soup = BeautifulSoup(response.content, 'html.parser')
+            print(soup.text)
+            exit()
+
+        with open(file_name, 'wb') as f:
+            f.write(response.content)
+        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")
+
+    except requests.exceptions.RequestException as e:
+        print(f"Failed to download file for DOI: {doi}. Error: {e}")
+
+def get_article_info(doi):
+    """Look *doi* up in DOAJ and try to download the article behind its link.
+
+    For an mdpi.com link the PDF and the EPUB are saved next to the script;
+    for a sciencedirect.com link the download is delegated to
+    ``dl_elsy.get_doc``. Any other link is printed and the function waits for
+    the user to press Enter.
+
+    Args:
+        doi: DOI of the article.
+
+    Returns:
+        The result of ``dl_elsy.get_doc`` for ScienceDirect links, *doi* after
+        an MDPI download or an unhandled link, and None when the request
+        failed or no usable link was found.
+    """
+    url = f'https://doaj.org/api/search/articles/{doi}'
+    response = requests.get(url)
+
+    if response.status_code == 200:
+        data = response.json()
+        for result in data.get('results', []):
+            for link in result.get('bibjson', {}).get('link', []):
+                if 'mdpi.com' in link['url']:
+                    r = requests.get(link['url'])
+                    soup = BeautifulSoup(r.content, 'html.parser')
+                    pdf_link_html = soup.find('a', {'class':'UD_ArticlePDF'})
+                    if pdf_link_html is None:
+                        # Page layout differs or the request was blocked:
+                        # try the next link instead of failing on None['href']
+                        print(f"No PDF link found on {link['url']} for DOI: {doi}")
+                        continue
+                    pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
+                    # NOTE(review): the HTTP status of the two downloads below
+                    # is not checked, so an error page would be saved as-is.
+                    pdf = requests.get(pdf_url)
+                    with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
+                        f.write(pdf.content)
+                    sleep(1)
+                    epub = requests.get(link['url'] + '/epub')
+                    with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
+                        f.write(epub.content)
+                    sleep(1)
+                    print(f'Downloaded PDF and EPUB for {doi}')
+                elif 'sciencedirect.com' in link['url']:
+                    return dl_elsy.get_doc(doi)
+                else:
+                    # Unknown publisher: show the link and wait for the user
+                    print(link['url'])
+                    input()
+                # NOTE(review): this returns after the first handled link, also
+                # for links that were only printed -- confirm this is intended.
+                return doi
+
+    else:
+        print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")
+
+# Read DOIs from file
+with open('review_references.csv', 'r') as f:
+    references = f.readlines()
+
+# Load the DOIs that earlier runs already handled. The progress file is only
+# read here; opening it with 'w' would erase it at the start of every run.
+# Lines are stripped so they compare equal to the stripped DOIs below.
+try:
+    with open('review_references.txt') as f2:
+        ref_done = {line.strip() for line in f2}
+except FileNotFoundError:
+    ref_done = set()
+
+# Process each DOI
+for ref in references:
+    doi = ref.strip()
+    if not doi:
+        continue  # skip blank lines
+    print('###', ref.upper())
+    if doi in ref_done:
+        continue  # already downloaded; no need to query Crossref again
+    try:
+        cr = crossref_commons.retrieval.get_publication_as_json(doi)
+    except ValueError:
+        print(f"Error fetching metadata for DOI: {doi}")
+        continue
+    # Only articles whose Crossref record mentions sciencedirect.com are fetched
+    if 'sciencedirect.com' not in str(cr):
+        continue
+
+    sleep(1)
+    r = dl_elsy.get_doc(doi)
+    if r:
+        with open('review_references.txt', 'a+') as f2:
+            f2.write(f'{r}\n')
+        ref_done.add(doi)
+
+exit()
+# NOTE(review): everything below is unreachable because of the exit() above;
+# kept as the alternative DOAJ-based download path.
+for ref in references:
+    doi = ref.strip()
+    with open('review_references.txt', 'a') as f2:
+
+        r = get_article_info(doi)
+        if r:
+            # One DOI per line, matching the format written above
+            f2.write(f'{r}\n')
--- a/group_parties.py
+++ b/group_parties.py
@ -0,0 +1,47 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information
+
+
+# Step 1: Read the CSV file
+df = pd.read_csv('speeches.csv', delimiter=';')
+
+# Step 2: Extract relevant columns
+# Every column from positional index 5 onwards (i.e. starting with the sixth
+# column) is treated as an argument column -- TODO confirm against the CSV layout
+arguments = df.columns[5:]
+# .copy() makes df_arguments an independent frame, so the assignment in step 3
+# modifies it directly instead of a possible view of df (SettingWithCopyWarning).
+df_arguments = df[['_key', 'name'] + list(arguments)].copy()
+
+# Step 3: Create a numeric matrix for arguments
+# Convert the argument columns to integers; values that cannot be parsed as
+# numbers become 0
+df_arguments[arguments] = df_arguments[arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
+
+# Step 4: Sum the arguments for each politician
+df_sum = df_arguments.groupby('name')[arguments].sum().reset_index()
+
+# Step 5: Plot the data
+plt.figure(figsize=(12, 8))
+sns.heatmap(df_sum.set_index('name'), annot=True, cmap='coolwarm', cbar=True)
+plt.title('Arguments Used by Politicians')
+plt.xlabel('Arguments')
+plt.ylabel('Politicians')
+
+# Step 6: Color the x-axis labels based on sentiment
+ax = plt.gca()
+x_labels = ax.get_xticklabels()
+for label in x_labels:
+    argument = label.get_text()
+    # Arguments without an entry in arguments_dict are treated as neutral
+    sentiment = arguments_dict.get(argument, {}).get('sentiment', 'neutral')
+    if sentiment == 'positive':
+        label.set_color('green')
+    elif sentiment == 'negative':
+        label.set_color('red')
+    else:
+        label.set_color('black')
+
+plt.xticks(rotation=45, ha='right')
+plt.tight_layout()
+
+# Save the plot instead of showing it
+plt.savefig('arguments_used_by_politicians.png')
+plt.close()
--- a/group_speakers_streamlit.py
+++ b/group_speakers_streamlit.py
@ -0,0 +1,80 @@
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans
+from sklearn.preprocessing import StandardScaler
+import streamlit as st
+from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information
+
+# Step 1: Read the CSV file
+df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')
+
+# Step 2: Extract relevant columns
+# Every column from positional index 5 onwards (i.e. starting with the sixth
+# column) is treated as an argument column -- TODO confirm against the CSV layout
+arguments = df.columns[5:]
+df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]
+
+# Step 3: Create a numeric matrix for arguments
+# Convert the argument columns to integers; values that cannot be parsed as
+# numbers become 0
+df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
+
+# Step 4: Calculate sentiment scores for each politician
+def calculate_sentiment_score(row):
+    """Return the net sentiment of one row.
+
+    Every argument column with a value above zero counts +1 when the argument
+    is marked 'positive' in ``arguments_dict`` and -1 when it is marked
+    'negative'; arguments without a sentiment entry count as neutral (0).
+    """
+    used = [column for column in arguments if row[column] > 0]
+    labels = [arguments_dict.get(column, {}).get('sentiment', 'neutral') for column in used]
+    positives = sum(label == 'positive' for label in labels)
+    negatives = sum(label == 'negative' for label in labels)
+    return positives - negatives
+
+# axis=1 calls the scoring function once per row
+df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)
+
+# # Step 5: Standardize the data
+# scaler = StandardScaler()
+# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])
+
+# Step 6: Dimensionality reduction using PCA
+# Only the argument columns go into the PCA; the sentiment score is left out.
+pca = PCA(n_components=2)
+pca_result = pca.fit_transform(df_arguments[arguments])
+df_arguments['pca1'] = pca_result[:, 0]
+df_arguments['pca2'] = pca_result[:, 1]
+
+# Step 7: Examine loadings
+# One row per argument, one column per principal component
+loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)
+
+# Step 8: Perform clustering
+# A fixed random_state makes the cluster assignment reproducible; without it
+# the labels (and so the colours in the plot) can change from one run of the
+# script to the next.
+kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust the number of clusters as needed
+df_arguments['cluster'] = kmeans.fit_predict(pca_result)
+
+# Streamlit app
+st.title('Politicians Grouped by Arguments Used and Sentiment Score')
+
+# Step 9: Plot the data with clusters using Plotly
+# NOTE(review): 'cluster' holds integers, so Plotly presumably colours it on a
+# continuous scale rather than as three categories -- confirm the desired look.
+fig = px.scatter(df_arguments, x='pca1', y='pca2', color='cluster', title='Politicians Grouped by Arguments Used and Clusters',
+                 labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'}, hover_data={'name': True})
+st.plotly_chart(fig)
+
+# Step 10: Visualize original arguments using Plotly
+fig = go.Figure()
+
+# Add arrows for loadings
+# Each argument is drawn as a line from the origin to its loading on the two
+# components, labelled with the argument name at the far end.
+for argument in arguments:
+    fig.add_trace(go.Scatter(x=[0, loadings.loc[argument, 'PCA1']], y=[0, loadings.loc[argument, 'PCA2']],
+                             mode='lines+text', text=[None, argument], textposition='top center',
+                             line=dict(color='red', width=2), showlegend=False))
+
+# Add scatter plot for politicians
+# Marker colour encodes the sentiment score; the name is shown on hover.
+fig.add_trace(go.Scatter(x=df_arguments['pca1'], y=df_arguments['pca2'], mode='markers',
+                         marker=dict(color=df_arguments['sentiment_score'], colorscale='Viridis', size=10),
+                         text=df_arguments['name'], hoverinfo='text'))
+
+fig.update_layout(title='PCA Biplot of Politicians and Arguments',
+                  xaxis_title='PCA Component 1',
+                  yaxis_title='PCA Component 2',
+                  showlegend=False)
+
+st.plotly_chart(fig)