Add code to download and process review references

This commit adds code to download review references from a CSV file and process them. References are read from 'review_references.csv' and each one is checked for a DOI; the user is prompted to confirm whether a DOI was found, and the reference together with that confirmation is written to a new file, 'review_references.txt'. This will make it easier to manage review references going forward.
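
A minimal sketch of the flow described above, assuming one reference per line in the CSV; the committed implementation is in dl_article_libgen.py and get_article_info.py in the diff below, and this snippet only mirrors that interactive loop:

import pyperclip  # clipboard helper, used the same way as in the committed script

with open('review_references.csv') as src, open('review_references.txt', 'w') as dst:
    for reference in src:
        reference = reference.strip()
        if not reference:
            continue
        pyperclip.copy(reference)  # put the reference on the clipboard for a manual DOI lookup
        found = input(f"{reference}\nFound DOI? (y/n): ")  # manual confirmation step
        dst.write(f"{reference}: {found}\n")  # record the reference together with the answer
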
main
lasseedfast 1 year ago
parent 08e17d13a5
commit 911b8c33b0
Changed files (lines changed):
  1. _arango.py (6)
  2. _chromadb.py (14)
  3. _llm.py (26)
  4. all_arguments.py (0)
  5. analyze_speeches.py (2)
  6. chatbot.py (23)
  7. clean_csv.py (15)
  8. create_chroma.py (149)
  9. dbx_test.py (19)
  10. dl_article_libgen.py (13)
  11. dl_elsy.py (26)
  12. explore_speakers.py (55)
  13. get_article_info.py (108)
  14. group_parties.py (47)
  15. group_speakers_streamlit.py (80)

_arango.py
@@ -1,4 +1,4 @@
+import re
 from arango import ArangoClient
 from dotenv import load_dotenv
 import os
@@ -53,3 +53,7 @@ class ArangoDB:
         if '/' in document_id:
             document_id = document_id.split('/')[-1]
         return self.db.collection('ev_speeches').get(document_id)
+
+    def fix_key(self, _key):
+        return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)

_chromadb.py
@@ -0,0 +1,14 @@
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint


class ChromaDB:
    def __init__(self):
        self.db = chromadb.PersistentClient("chroma_db")
        max_characters = 2200
        self.ts = MarkdownSplitter(max_characters)
        self.sci_articles = self.db.get_or_create_collection("sci_articles")

_llm.py
@@ -0,0 +1,26 @@
from ollama import Client
import os
import env_manager

env_manager.set_env()


class LLM:
    def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None:
        self.llm_model = os.getenv("LLM_MODEL")
        self.system_message = system_message
        self.options = {"temperature": temperature, "num_ctx": num_ctx}
        self.messages = [{'role': 'system', 'content': self.system_message}]
        self.chat = chat
        self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}')

    def generate(self, prompt: str) -> str:
        self.messages.append({"role": "user", "content": prompt})
        result = self.ollama.chat(model=self.llm_model, messages=self.messages, options=self.options)
        answer = result['message']['content']
        self.messages.append({"role": "assistant", "content": answer})
        if not self.chat:
            # Not in chat mode: reset the history so each prompt is answered independently
            self.messages = [{'role': 'system', 'content': self.system_message}]
        return answer

analyze_speeches.py
@@ -2,7 +2,7 @@ from _llm import LLM
 from collections import Counter
 from dotenv import load_dotenv
 from _arango import ArangoDB
-from arguments import arguments as all_arguments
+from all_arguments import arguments as all_arguments
 from colorprinter.print_color import *
 import matplotlib.pyplot as plt
 from sklearn.cluster import KMeans

chatbot.py
@@ -0,0 +1,23 @@
from _llm import LLM
from _chromadb import ChromaDB

chromadb = ChromaDB()
llm = LLM(temperature=0.1)

while True:
    user_input = input("Enter a prompt: ")
    chunks = chromadb.sci_articles.query(query_texts=user_input)
    # query() returns the matched documents as plain strings
    chunks_string = "\n".join(chunks['documents'][0])
    prompt = f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
"""
{chunks_string}
"""
{user_input}
'''
    response = llm.generate(prompt)
    print(response)
    print()

clean_csv.py
@@ -0,0 +1,15 @@
import csv

# Preprocess the CSV file to ensure consistent field counts
input_file = 'speeches.csv'
output_file = 'cleaned_speeches.csv'

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile, delimiter=';', quotechar='"')
    writer = csv.writer(outfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        if len(row) == 22:  # Ensure the row has the correct number of fields
            writer.writerow(row)

print("CSV file has been cleaned and saved as 'cleaned_speeches.csv'")

create_chroma.py
@@ -0,0 +1,149 @@
import re
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
import crossref_commons.retrieval as crossref
import ebooklib
from ebooklib import epub
import nltk
from bs4 import BeautifulSoup

# from epub_conversion.utils import open_book, convert_epub_to_lines


def get_crossref(doi):
    try:
        work = crossref.get_publication_as_json(doi)

        # Determine the best publication date
        if "published-print" in work:
            publication_date = work["published-print"]["date-parts"][0]
        elif "published-online" in work:
            publication_date = work["published-online"]["date-parts"][0]
        elif "issued" in work:
            publication_date = work["issued"]["date-parts"][0]
        else:
            publication_date = [None]

        metadata = {
            "doi": work.get("DOI", None),
            "title": work.get("title", [None])[0],  # Extract the first title if available
            "authors": [
                f"{author['given']} {author['family']}"
                for author in work.get("author", [])
            ],
            "abstract": work.get("abstract", None),
            "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
            "url_doi": work.get("URL", None),
            "link": (
                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
            ),
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None


arango = ArangoDB()
arango.db.collection("sci_articles").truncate()  #!

# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles")  #!
col = db.get_or_create_collection("articles")

max_characters = 2200
ts = MarkdownSplitter(max_characters)


def add_pdfs(path_folder):
    pdf_in_folder = []
    for file in os.listdir(path_folder):
        if file.endswith(".pdf"):
            pdf_in_folder.append(file)

    for pdf in pdf_in_folder:
        doi = pdf.strip(".pdf").replace("_", "/")
        crossref_info = get_crossref(doi)
        if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
            print(f"Article {doi} already in database")
            continue

        pdf_path = os.path.join("sci_articles", pdf)
        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
        md_text = ""
        for page in md_pages:
            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"

        ids = []
        documents = []
        metadatas = []
        better_chunks = []
        chunks = ts.chunks(md_text)
        # Merge chunks that are too short into the previous chunk
        for chunk in chunks:
            if all(
                [
                    len(chunk) < int(max_characters / 3),  # TODO Are those values good?
                    len(chunks[-1]) < int(max_characters * 1.5),
                    len(better_chunks) > 0,
                ]
            ):
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk)

        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(better_chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            id = arango.fix_key(doi) + f"_{i}"
            ids.append(id)
            metadatas.append(
                {
                    "doi": pdf.strip(".pdf"),
                    "file": pdf_path,
                    "chunk_nr": i,
                    "pages": ",".join([str(i) for i in page_numbers]),
                }
            )
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers})

        col.add(ids=ids, documents=documents, metadatas=metadatas)

        arango_document = {
            "_key": arango.fix_key(doi),
            "doi": doi,
            "file": pdf_path,
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": crossref_info,
        }
        arango.db.collection("sci_articles").insert(
            arango_document, overwrite=True, overwrite_mode="update"
        )
        print(f"Inserted article {doi} into database")


path_folder = "sci_articles"
add_pdfs(path_folder)

dbx_test.py
@@ -0,0 +1,19 @@
import dropbox

# Replace with your access token
ACCESS_TOKEN = 'sl.B-hTaHGCpioPzyC_BVCulhgIP3xTfpTcEgaPwkpzu00j3rgA7Q-9Durd2S1TnA5yqiS_ucn4YcDdyG_VFxropLZiyVPhxd4MiIHpFItugn9DCoMjtiy3Y8lJ6iD2I1A7DAhjlTavVUnxNTc'

# Initialize a Dropbox client
dbx = dropbox.Dropbox(ACCESS_TOKEN)

# Define the folder path
folder_path = '/Filinlämningar/Electric Cars'

# List all files in the root directory
try:
    result = dbx.files_list_folder(folder_path)
    print(f"Files in the root directory:")
    for entry in result.entries:
        print(entry.name)
except dropbox.exceptions.ApiError as err:
    print(f"Failed to list folder contents: {err}")

dl_article_libgen.py
@@ -0,0 +1,13 @@
import pyperclip

with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'w') as f2:
        references = f.readlines()
        for ref in references:
            print(ref)
            # Copy ref to clipboard
            pyperclip.copy(ref.strip())
            found = input("Found DOI? (y/n): ")
            f2.write(f"{ref.strip()}: {found}\n")

dl_elsy.py
@@ -0,0 +1,26 @@
"""An example program that uses the elsapy module"""

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json

## Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()

## Initialize client
client = ElsClient(config['apikey'])


def get_doc(doi):
    ## ScienceDirect (full-text) document example using DOI
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(client):
        print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
        return doi
    else:
        return None

explore_speakers.py
@@ -0,0 +1,55 @@
import pandas as pd
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')
print(df.head())

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row):
    score = 0
    for arg in arguments:
        if row[arg] > 0:
            sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score

df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# Step 5: Identify the top 3 most positive and negative politicians
top_3_positive = df_arguments.nlargest(3, 'sentiment_score')
top_3_negative = df_arguments.nsmallest(3, 'sentiment_score')

# Step 6: Extract arguments used by these politicians
def extract_arguments(df):
    result = {}
    for _, row in df.iterrows():
        name = row['name']
        used_arguments = [arg for arg in arguments if row[arg] > 0]
        result[name] = used_arguments
    return result

positive_arguments = extract_arguments(top_3_positive)
negative_arguments = extract_arguments(top_3_negative)

# Print the results
print("Top 3 Positive Politicians and their Arguments:")
for name, args in positive_arguments.items():
    print(f"{name}: {args}")

print("\nTop 3 Negative Politicians and their Arguments:")
for name, args in negative_arguments.items():
    print(f"{name}: {args}")

get_article_info.py
@@ -0,0 +1,108 @@
import pyperclip
from pprint import pprint
import requests
import crossref_commons.retrieval
from time import sleep
from bs4 import BeautifulSoup

import dl_elsy


def download_file(doi, url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        content_type = response.headers['Content-Type']
        if content_type == 'application/pdf':
            file_extension = 'pdf'
        elif content_type.startswith('text/'):
            file_extension = 'md'
        else:
            print(f"Unsupported content type: {content_type} for DOI: {doi}")
            return

        file_name = f"{doi}.{file_extension}".replace('/', '_')
        if file_extension == 'md':
            soup = BeautifulSoup(response.content, 'html.parser')
            print(soup.text)
            exit()
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")


def get_article_info(doi):
    url = f'https://doaj.org/api/search/articles/{doi}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        for result in data.get('results', []):
            for link in result.get('bibjson', {}).get('link', []):
                if 'mdpi.com' in link['url']:
                    r = requests.get(link['url'])
                    soup = BeautifulSoup(r.content, 'html.parser')
                    pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                    pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                    pdf = requests.get(pdf_url)
                    with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                        f.write(pdf.content)
                    sleep(1)
                    epub = requests.get(link['url'] + '/epub')
                    with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
                        f.write(epub.content)
                    sleep(1)
                    print(f'Downloaded PDF and EPUB for {doi}')
                elif 'sciencedirect.com' in link['url']:
                    return dl_elsy.get_doc(doi)
                    sleep(1)
                else:
                    print(link['url'])
                    input()
        return doi
    else:
        print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")


# Read DOIs from file
with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'w') as f2:
        references = f.readlines()

# Process each DOI
with open('review_references.txt') as f2:
    ref_done = f2.readlines()

for ref in references:
    doi = ref.strip()
    print('###', ref.upper())
    try:
        cr = crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError:
        print(f"Error fetching metadata for DOI: {doi}")
        continue
    if 'sciencedirect.com' not in str(cr):
        continue
    if doi not in ref_done:
        sleep(1)
        r = dl_elsy.get_doc(doi)
        if r:
            with open('review_references.txt', 'a+') as f2:
                f2.write(f'{r}\n')

exit()

for ref in references:
    doi = ref.strip()
    with open('review_references.txt', 'a') as f2:
        r = get_article_info(doi)
        if r:
            f2.write(r)

group_parties.py
@@ -0,0 +1,47 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('speeches.csv', delimiter=';')

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df[['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments[arguments] = df_arguments[arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Sum the arguments for each politician
df_sum = df_arguments.groupby('name')[arguments].sum().reset_index()

# Step 5: Plot the data
plt.figure(figsize=(12, 8))
sns.heatmap(df_sum.set_index('name'), annot=True, cmap='coolwarm', cbar=True)
plt.title('Arguments Used by Politicians')
plt.xlabel('Arguments')
plt.ylabel('Politicians')

# Step 6: Color the x-axis labels based on sentiment
ax = plt.gca()
x_labels = ax.get_xticklabels()
for label in x_labels:
    argument = label.get_text()
    sentiment = arguments_dict.get(argument, {}).get('sentiment', 'neutral')
    if sentiment == 'positive':
        label.set_color('green')
    elif sentiment == 'negative':
        label.set_color('red')
    else:
        label.set_color('black')

plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save the plot instead of showing it
plt.savefig('arguments_used_by_politicians.png')
plt.close()

group_speakers_streamlit.py
@@ -0,0 +1,80 @@
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import streamlit as st
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row):
    score = 0
    for arg in arguments:
        if row[arg] > 0:
            sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score

df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# # Step 5: Standardize the data
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])

# Step 6: Dimensionality reduction using PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]

# Step 7: Examine loadings
loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)

# Step 8: Perform clustering
kmeans = KMeans(n_clusters=3)  # Adjust the number of clusters as needed
df_arguments['cluster'] = kmeans.fit_predict(pca_result)

# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')

# Step 9: Plot the data with clusters using Plotly
fig = px.scatter(df_arguments, x='pca1', y='pca2', color='cluster', title='Politicians Grouped by Arguments Used and Clusters',
                 labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'}, hover_data={'name': True})
st.plotly_chart(fig)

# Step 10: Visualize original arguments using Plotly
fig = go.Figure()

# Add arrows for loadings
for argument in arguments:
    fig.add_trace(go.Scatter(x=[0, loadings.loc[argument, 'PCA1']], y=[0, loadings.loc[argument, 'PCA2']],
                             mode='lines+text', text=[None, argument], textposition='top center',
                             line=dict(color='red', width=2), showlegend=False))

# Add scatter plot for politicians
fig.add_trace(go.Scatter(x=df_arguments['pca1'], y=df_arguments['pca2'], mode='markers',
                         marker=dict(color=df_arguments['sentiment_score'], colorscale='Viridis', size=10),
                         text=df_arguments['name'], hoverinfo='text'))
fig.update_layout(title='PCA Biplot of Politicians and Arguments',
                  xaxis_title='PCA Component 1',
                  yaxis_title='PCA Component 2',
                  showlegend=False)
st.plotly_chart(fig)