Add code to download and process review references

This commit adds code to download review references from a CSV file and process them. References are read from 'review_references.csv' and each one is checked for a DOI; the user is prompted to confirm whether a DOI was found, and the reference together with that confirmation is written to a new file, 'review_references.txt'. This will make it easier to manage review references going forward.
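
A minimal sketch of the flow described above, assuming one reference per line in the CSV; the committed implementation is in dl_article_libgen.py and get_article_info.py in the diff below, and this snippet only mirrors that interactive loop:

import pyperclip  # clipboard helper, used the same way as in the committed script

with open('review_references.csv') as src, open('review_references.txt', 'w') as dst:
    for reference in src:
        reference = reference.strip()
        if not reference:
            continue
        pyperclip.copy(reference)  # put the reference on the clipboard for a manual DOI lookup
        found = input(f"{reference}\nFound DOI? (y/n): ")  # manual confirmation step
        dst.write(f"{reference}: {found}\n")  # record the reference together with the answer
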
main
lasseedfast 1 year ago
parent 08e17d13a5
commit 911b8c33b0
Changed files (lines changed):
  1. _arango.py (6)
  2. _chromadb.py (14)
  3. _llm.py (26)
  4. all_arguments.py (0)
  5. analyze_speeches.py (2)
  6. chatbot.py (23)
  7. clean_csv.py (15)
  8. create_chroma.py (149)
  9. dbx_test.py (19)
  10. dl_article_libgen.py (13)
  11. dl_elsy.py (26)
  12. explore_speakers.py (55)
  13. get_article_info.py (108)
  14. group_parties.py (47)
  15. group_speakers_streamlit.py (80)

_arango.py
@@ -1,4 +1,4 @@
+import re
 from arango import ArangoClient
 from dotenv import load_dotenv
 import os
@@ -53,3 +53,7 @@ class ArangoDB:
         if '/' in document_id:
             document_id = document_id.split('/')[-1]
         return self.db.collection('ev_speeches').get(document_id)
+
+    def fix_key(self, _key):
+        return re.sub(r'[^A-Za-z0-9_\-\.@()+=;$!*\'%:]', '_', _key)

_chromadb.py
@@ -0,0 +1,14 @@
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint


class ChromaDB:
    def __init__(self):
        self.db = chromadb.PersistentClient("chroma_db")
        max_characters = 2200
        self.ts = MarkdownSplitter(max_characters)
        self.sci_articles = self.db.get_or_create_collection("sci_articles")

_llm.py
@@ -0,0 +1,26 @@
from ollama import Client
import os
import env_manager

env_manager.set_env()


class LLM:
    def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None:
        self.llm_model = os.getenv("LLM_MODEL")
        self.system_message = system_message
        self.options = {"temperature": temperature, "num_ctx": num_ctx}
        self.messages = [{'role': 'system', 'content': self.system_message}]
        self.chat = chat
        self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}')

    def generate(self, prompt: str) -> str:
        self.messages.append({"role": "user", "content": prompt})
        result = self.ollama.chat(model=self.llm_model, messages=self.messages, options=self.options)
        answer = result['message']['content']
        self.messages.append({"role": "assistant", "content": answer})
        if not self.chat:
            # Not in chat mode: reset the history so each prompt is answered independently
            self.messages = [{'role': 'system', 'content': self.system_message}]
        return answer

analyze_speeches.py
@@ -2,7 +2,7 @@ from _llm import LLM
 from collections import Counter
 from dotenv import load_dotenv
 from _arango import ArangoDB
-from arguments import arguments as all_arguments
+from all_arguments import arguments as all_arguments
 from colorprinter.print_color import *
 import matplotlib.pyplot as plt
 from sklearn.cluster import KMeans

chatbot.py
@@ -0,0 +1,23 @@
from _llm import LLM
from _chromadb import ChromaDB

chromadb = ChromaDB()
llm = LLM(temperature=0.1)

while True:
    user_input = input("Enter a prompt: ")
    chunks = chromadb.sci_articles.query(query_texts=user_input)
    # query() returns the matched documents as plain strings
    chunks_string = "\n".join(chunks['documents'][0])
    prompt = f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.
"""
{chunks_string}
"""
{user_input}
'''
    response = llm.generate(prompt)
    print(response)
    print()

clean_csv.py
@@ -0,0 +1,15 @@
import csv

# Preprocess the CSV file to ensure consistent field counts
input_file = 'speeches.csv'
output_file = 'cleaned_speeches.csv'

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile, delimiter=';', quotechar='"')
    writer = csv.writer(outfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        if len(row) == 22:  # Ensure the row has the correct number of fields
            writer.writerow(row)

print("CSV file has been cleaned and saved as 'cleaned_speeches.csv'")

create_chroma.py
@@ -0,0 +1,149 @@
import re
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
import crossref_commons.retrieval as crossref
import ebooklib
from ebooklib import epub
import nltk
from bs4 import BeautifulSoup

# from epub_conversion.utils import open_book, convert_epub_to_lines


def get_crossref(doi):
    try:
        work = crossref.get_publication_as_json(doi)

        # Determine the best publication date
        if "published-print" in work:
            publication_date = work["published-print"]["date-parts"][0]
        elif "published-online" in work:
            publication_date = work["published-online"]["date-parts"][0]
        elif "issued" in work:
            publication_date = work["issued"]["date-parts"][0]
        else:
            publication_date = [None]

        metadata = {
            "doi": work.get("DOI", None),
            "title": work.get("title", [None])[0],  # Extract the first title if available
            "authors": [
                f"{author['given']} {author['family']}"
                for author in work.get("author", [])
            ],
            "abstract": work.get("abstract", None),
            "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
            "url_doi": work.get("URL", None),
            "link": (
                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
            ),
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None


arango = ArangoDB()
arango.db.collection("sci_articles").truncate()  #!

# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles")  #!
col = db.get_or_create_collection("articles")

max_characters = 2200
ts = MarkdownSplitter(max_characters)


def add_pdfs(path_folder):
    pdf_in_folder = []
    for file in os.listdir(path_folder):
        if file.endswith(".pdf"):
            pdf_in_folder.append(file)

    for pdf in pdf_in_folder:
        doi = pdf.strip(".pdf").replace("_", "/")
        crossref_info = get_crossref(doi)
        if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
            print(f"Article {doi} already in database")
            continue

        pdf_path = os.path.join("sci_articles", pdf)
        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)
        md_text = ""
        for page in md_pages:
            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"

        ids = []
        documents = []
        metadatas = []
        better_chunks = []
        chunks = ts.chunks(md_text)
        # Merge chunks that are too short into the previous chunk
        for chunk in chunks:
            if all(
                [
                    len(chunk) < int(max_characters / 3),  # TODO Are those values good?
                    len(chunks[-1]) < int(max_characters * 1.5),
                    len(better_chunks) > 0,
                ]
            ):
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk)

        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(better_chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            id = arango.fix_key(doi) + f"_{i}"
            ids.append(id)
            metadatas.append(
                {
                    "doi": pdf.strip(".pdf"),
                    "file": pdf_path,
                    "chunk_nr": i,
                    "pages": ",".join([str(i) for i in page_numbers]),
                }
            )
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers})

        col.add(ids=ids, documents=documents, metadatas=metadatas)

        arango_document = {
            "_key": arango.fix_key(doi),
            "doi": doi,
            "file": pdf_path,
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": crossref_info,
        }
        arango.db.collection("sci_articles").insert(
            arango_document, overwrite=True, overwrite_mode="update"
        )
        print(f"Inserted article {doi} into database")


path_folder = "sci_articles"
add_pdfs(path_folder)

dbx_test.py
@@ -0,0 +1,19 @@
import dropbox

# Replace with your access token
ACCESS_TOKEN = 'sl.B-hTaHGCpioPzyC_BVCulhgIP3xTfpTcEgaPwkpzu00j3rgA7Q-9Durd2S1TnA5yqiS_ucn4YcDdyG_VFxropLZiyVPhxd4MiIHpFItugn9DCoMjtiy3Y8lJ6iD2I1A7DAhjlTavVUnxNTc'

# Initialize a Dropbox client
dbx = dropbox.Dropbox(ACCESS_TOKEN)

# Define the folder path
folder_path = '/Filinlämningar/Electric Cars'

# List all files in the root directory
try:
    result = dbx.files_list_folder(folder_path)
    print(f"Files in the root directory:")
    for entry in result.entries:
        print(entry.name)
except dropbox.exceptions.ApiError as err:
    print(f"Failed to list folder contents: {err}")

dl_article_libgen.py
@@ -0,0 +1,13 @@
import pyperclip

with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'w') as f2:
        references = f.readlines()
        for ref in references:
            print(ref)
            # Copy ref to clipboard
            pyperclip.copy(ref.strip())
            found = input("Found DOI? (y/n): ")
            f2.write(f"{ref.strip()}: {found}\n")

dl_elsy.py
@@ -0,0 +1,26 @@
"""An example program that uses the elsapy module"""

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json

## Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()

## Initialize client
client = ElsClient(config['apikey'])


def get_doc(doi):
    ## ScienceDirect (full-text) document example using DOI
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(client):
        print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
        return doi
    else:
        return None

explore_speakers.py
@@ -0,0 +1,55 @@
import pandas as pd
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')
print(df.head())

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row):
    score = 0
    for arg in arguments:
        if row[arg] > 0:
            sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score

df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# Step 5: Identify the top 3 most positive and negative politicians
top_3_positive = df_arguments.nlargest(3, 'sentiment_score')
top_3_negative = df_arguments.nsmallest(3, 'sentiment_score')

# Step 6: Extract arguments used by these politicians
def extract_arguments(df):
    result = {}
    for _, row in df.iterrows():
        name = row['name']
        used_arguments = [arg for arg in arguments if row[arg] > 0]
        result[name] = used_arguments
    return result

positive_arguments = extract_arguments(top_3_positive)
negative_arguments = extract_arguments(top_3_negative)

# Print the results
print("Top 3 Positive Politicians and their Arguments:")
for name, args in positive_arguments.items():
    print(f"{name}: {args}")

print("\nTop 3 Negative Politicians and their Arguments:")
for name, args in negative_arguments.items():
    print(f"{name}: {args}")

get_article_info.py
@@ -0,0 +1,108 @@
import pyperclip
from pprint import pprint
import requests
import crossref_commons.retrieval
from time import sleep
from bs4 import BeautifulSoup

import dl_elsy


def download_file(doi, url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        content_type = response.headers['Content-Type']
        if content_type == 'application/pdf':
            file_extension = 'pdf'
        elif content_type.startswith('text/'):
            file_extension = 'md'
        else:
            print(f"Unsupported content type: {content_type} for DOI: {doi}")
            return

        file_name = f"{doi}.{file_extension}".replace('/', '_')
        if file_extension == 'md':
            soup = BeautifulSoup(response.content, 'html.parser')
            print(soup.text)
            exit()
        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")


def get_article_info(doi):
    url = f'https://doaj.org/api/search/articles/{doi}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        for result in data.get('results', []):
            for link in result.get('bibjson', {}).get('link', []):
                if 'mdpi.com' in link['url']:
                    r = requests.get(link['url'])
                    soup = BeautifulSoup(r.content, 'html.parser')
                    pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                    pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                    pdf = requests.get(pdf_url)
                    with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                        f.write(pdf.content)
                    sleep(1)
                    epub = requests.get(link['url'] + '/epub')
                    with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
                        f.write(epub.content)
                    sleep(1)
                    print(f'Downloaded PDF and EPUB for {doi}')
                elif 'sciencedirect.com' in link['url']:
                    return dl_elsy.get_doc(doi)
                    sleep(1)
                else:
                    print(link['url'])
                    input()
        return doi
    else:
        print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")


# Read DOIs from file
with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'w') as f2:
        references = f.readlines()

# Process each DOI
with open('review_references.txt') as f2:
    ref_done = f2.readlines()

for ref in references:
    doi = ref.strip()
    print('###', ref.upper())
    try:
        cr = crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError:
        print(f"Error fetching metadata for DOI: {doi}")
        continue
    if 'sciencedirect.com' not in str(cr):
        continue
    if doi not in ref_done:
        sleep(1)
        r = dl_elsy.get_doc(doi)
        if r:
            with open('review_references.txt', 'a+') as f2:
                f2.write(f'{r}\n')

exit()

for ref in references:
    doi = ref.strip()
    with open('review_references.txt', 'a') as f2:
        r = get_article_info(doi)
        if r:
            f2.write(r)

group_parties.py
@@ -0,0 +1,47 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('speeches.csv', delimiter=';')

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df[['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments[arguments] = df_arguments[arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Sum the arguments for each politician
df_sum = df_arguments.groupby('name')[arguments].sum().reset_index()

# Step 5: Plot the data
plt.figure(figsize=(12, 8))
sns.heatmap(df_sum.set_index('name'), annot=True, cmap='coolwarm', cbar=True)
plt.title('Arguments Used by Politicians')
plt.xlabel('Arguments')
plt.ylabel('Politicians')

# Step 6: Color the x-axis labels based on sentiment
ax = plt.gca()
x_labels = ax.get_xticklabels()
for label in x_labels:
    argument = label.get_text()
    sentiment = arguments_dict.get(argument, {}).get('sentiment', 'neutral')
    if sentiment == 'positive':
        label.set_color('green')
    elif sentiment == 'negative':
        label.set_color('red')
    else:
        label.set_color('black')

plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save the plot instead of showing it
plt.savefig('arguments_used_by_politicians.png')
plt.close()

group_speakers_streamlit.py
@@ -0,0 +1,80 @@
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import streamlit as st
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row):
    score = 0
    for arg in arguments:
        if row[arg] > 0:
            sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score

df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# # Step 5: Standardize the data
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])

# Step 6: Dimensionality reduction using PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]

# Step 7: Examine loadings
loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)

# Step 8: Perform clustering
kmeans = KMeans(n_clusters=3)  # Adjust the number of clusters as needed
df_arguments['cluster'] = kmeans.fit_predict(pca_result)

# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')

# Step 9: Plot the data with clusters using Plotly
fig = px.scatter(df_arguments, x='pca1', y='pca2', color='cluster', title='Politicians Grouped by Arguments Used and Clusters',
                 labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'}, hover_data={'name': True})
st.plotly_chart(fig)

# Step 10: Visualize original arguments using Plotly
fig = go.Figure()

# Add arrows for loadings
for argument in arguments:
    fig.add_trace(go.Scatter(x=[0, loadings.loc[argument, 'PCA1']], y=[0, loadings.loc[argument, 'PCA2']],
                             mode='lines+text', text=[None, argument], textposition='top center',
                             line=dict(color='red', width=2), showlegend=False))

# Add scatter plot for politicians
fig.add_trace(go.Scatter(x=df_arguments['pca1'], y=df_arguments['pca2'], mode='markers',
                         marker=dict(color=df_arguments['sentiment_score'], colorscale='Viridis', size=10),
                         text=df_arguments['name'], hoverinfo='text'))
fig.update_layout(title='PCA Biplot of Politicians and Arguments',
                  xaxis_title='PCA Component 1',
                  yaxis_title='PCA Component 2',
                  showlegend=False)
st.plotly_chart(fig)