This commit adds code to download review references from a CSV file and process them. The references are read from a file called 'review_references.csv' and each one is checked for a DOI. The user is prompted to confirm whether a DOI was found, and the reference along with that confirmation is written to a new file called 'review_references.txt'. This code will be useful for managing review references in the future.
parent 08e17d13a5
commit 911b8c33b0
15 changed files with 581 additions and 2 deletions
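For the review-reference checking described in the commit message, each processed line ends up in review_references.txt as the reference followed by the y/n answer, roughly like this (the DOIs below are invented examples, not taken from the commit):
# 10.1234/example-one: y
# 10.5678/example-two: n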
@@ -0,0 +1,14 @@
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint


class ChromaDB:
    def __init__(self):
        self.db = chromadb.PersistentClient("chroma_db")
        max_characters = 2200
        self.ts = MarkdownSplitter(max_characters)
        self.sci_articles = self.db.get_or_create_collection("sci_articles")
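A minimal usage sketch for this wrapper, assuming the sci_articles collection has already been populated by the ingest script later in this diff (the query text and variable names are illustrative, not part of the commit):

store = ChromaDB()
results = store.sci_articles.query(query_texts=["example question about the articles"], n_results=5)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta.get("doi"), doc[:80])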
@@ -0,0 +1,26 @@
from ollama import Client
import os
import env_manager
env_manager.set_env()


class LLM:
    def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None:
        self.llm_model = os.getenv("LLM_MODEL")
        self.system_message = system_message
        self.options = {"temperature": temperature, "num_ctx": num_ctx}
        self.messages = [{'role': 'system', 'content': self.system_message}]
        self.chat = chat
        self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}')

    def generate(self, prompt: str) -> str:
        self.messages.append({"role": "user", "content": prompt})

        result = self.ollama.chat(model=self.llm_model, messages=self.messages, options=self.options)

        answer = result['message']['content']
        self.messages.append({"role": "assistant", "content": answer})
        if not self.chat:
            self.messages = [{'role': 'system', 'content': self.system_message}]

        return answer
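A minimal usage sketch for the LLM wrapper, assuming env_manager.set_env() has populated LLM_MODEL, LLM_URL and LLM_PORT; the system message and prompt below are placeholders:

assistant = LLM(system_message="You answer questions about scientific articles.", chat=False)
# chat=False resets the history to just the system message after each call
print(assistant.generate("What is a DOI and why is it useful for citation management?"))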
@@ -0,0 +1,23 @@
from _llm import LLM
from _chromadb import ChromaDB


chromadb = ChromaDB()
llm = LLM(temperature=0.1)


while True:
    user_input = input("Enter a prompt: ")
    chunks = chromadb.sci_articles.query(query_texts=user_input)
    # the returned documents are plain strings, one inner list per query text
    chunks_string = "\n".join(chunks['documents'][0])
    prompt = f'''{user_input}
Below are snippets from different articles. ONLY use the information below to answer the question. Do not use any other information.

"""
{chunks_string}
"""

{user_input}

'''
    response = llm.generate(prompt)
    print(response)
    print()
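For reference, a sketch of the result shape the loop above unpacks (the keys are chromadb's query() output; the values here are invented):

# {"ids": [["10_1016_example_0", ...]],
#  "documents": [["chunk text ...", ...]],
#  "metadatas": [[{"doi": "10.1016/example", "pages": "3"}, ...]],
#  "distances": [[0.23, ...]]}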
@@ -0,0 +1,15 @@
import csv

# Preprocess the CSV file to ensure consistent field counts
input_file = 'speeches.csv'
output_file = 'cleaned_speeches.csv'

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile, delimiter=';', quotechar='"')
    writer = csv.writer(outfile, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for row in reader:
        if len(row) == 22:  # Ensure the row has the correct number of fields
            writer.writerow(row)

print("CSV file has been cleaned and saved as 'cleaned_speeches.csv'")
@@ -0,0 +1,149 @@
import re
import chromadb
import os
import pymupdf4llm
from semantic_text_splitter import MarkdownSplitter
from _arango import ArangoDB
from pprint import pprint
import crossref_commons.retrieval as crossref
import ebooklib
from ebooklib import epub
import nltk
from bs4 import BeautifulSoup

# from epub_conversion.utils import open_book, convert_epub_to_lines


def get_crossref(doi):
    try:
        work = crossref.get_publication_as_json(doi)

        # Determine the best publication date
        if "published-print" in work:
            publication_date = work["published-print"]["date-parts"][0]
        elif "published-online" in work:
            publication_date = work["published-online"]["date-parts"][0]
        elif "issued" in work:
            publication_date = work["issued"]["date-parts"][0]
        else:
            publication_date = [None]

        metadata = {
            "doi": work.get("DOI", None),
            "title": work.get("title", [None])[0],  # Extract the first title if available
            "authors": [
                f"{author['given']} {author['family']}"
                for author in work.get("author", [])
            ],
            "abstract": work.get("abstract", None),
            "journal": work.get("container-title", [None])[0],  # Extract the first journal title if available
            "volume": work.get("volume", None),
            "issue": work.get("issue", None),
            "pages": work.get("page", None),
            "published_date": "-".join(map(str, publication_date)),  # Join date parts with hyphens
            "url_doi": work.get("URL", None),
            "link": (
                work.get("link", [None])[0]["URL"] if work.get("link", None) else None
            ),
            "language": work.get("language", None),
        }
        return metadata
    except Exception as e:
        print(f"Error retrieving metadata for DOI {doi}: {e}")
        return None


arango = ArangoDB()
arango.db.collection("sci_articles").truncate()  #!

# Initialize the chroma database
db = chromadb.PersistentClient("chroma_db")
col = db.get_or_create_collection("articles")
db.delete_collection("articles")  #!
col = db.get_or_create_collection("articles")
max_characters = 2200
ts = MarkdownSplitter(max_characters)


def add_pdfs(path_folder):
    pdf_in_folder = []
    for file in os.listdir(path_folder):
        if file.endswith(".pdf"):
            pdf_in_folder.append(file)

    for pdf in pdf_in_folder:
        doi = pdf.removesuffix(".pdf").replace("_", "/")
        crossref_info = get_crossref(doi)

        if arango.db.collection("sci_articles").get(arango.fix_key(doi)):
            print(f"Article {doi} already in database")
            continue
        pdf_path = os.path.join("sci_articles", pdf)
        md_pages = pymupdf4llm.to_markdown(pdf_path, page_chunks=True)

        # Interleave @page@ markers so chunk page numbers can be recovered later
        md_text = ""
        for page in md_pages:
            md_text += f"{page['text']}\n@{page['metadata']['page']}@\n"

        ids = []
        documents = []
        metadatas = []
        better_chunks = []
        chunks = ts.chunks(md_text)

        # Merge chunks that are too short into the previous chunk
        for chunk in chunks:
            if all(
                [
                    len(chunk) < int(max_characters / 3),  # TODO Are those values good?
                    len(better_chunks[-1]) < int(max_characters * 1.5) if better_chunks else False,
                    len(better_chunks) > 0,
                ]
            ):
                better_chunks[-1] += chunk
            else:
                better_chunks.append(chunk)
        arango_chunks = []
        last_page = 1
        for i, chunk in enumerate(better_chunks):
            page_numbers = re.findall(r"@(\d+)@", chunk)
            if page_numbers == []:
                page_numbers = [last_page]
            else:
                last_page = page_numbers[-1]
            chunk_id = arango.fix_key(doi) + f"_{i}"
            ids.append(chunk_id)
            metadatas.append(
                {
                    "doi": pdf.removesuffix(".pdf"),
                    "file": pdf_path,
                    "chunk_nr": i,
                    "pages": ",".join(str(p) for p in page_numbers),
                }
            )
            chunk = re.sub(r"@(\d+)@", "", chunk)
            documents.append(chunk)
            arango_chunks.append({"text": chunk, "pages": page_numbers})
        col.add(ids=ids, documents=documents, metadatas=metadatas)
        arango_document = {
            "_key": arango.fix_key(doi),
            "doi": doi,
            "file": pdf_path,
            "chunks": arango_chunks,
            "text": md_text,
            "metadata": crossref_info,
        }
        arango.db.collection("sci_articles").insert(
            arango_document, overwrite=True, overwrite_mode="update"
        )
        print(f"Inserted article {doi} into database")


path_folder = "sci_articles"
add_pdfs(path_folder)
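The ingest script tracks page numbers by interleaving @N@ markers into the markdown and recovering them per chunk with a regex before stripping them out; a small self-contained illustration of that step (the sample text is made up):

import re

sample_chunk = "Electric vehicles reduce local emissions.\n@3@\nBattery costs keep falling.\n@4@\n"
pages = re.findall(r"@(\d+)@", sample_chunk)   # ['3', '4']
clean = re.sub(r"@(\d+)@", "", sample_chunk)   # markers removed before the chunk is stored
print(pages, clean.strip())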
@@ -0,0 +1,19 @@
import dropbox

# Replace with your access token
ACCESS_TOKEN = 'sl.B-hTaHGCpioPzyC_BVCulhgIP3xTfpTcEgaPwkpzu00j3rgA7Q-9Durd2S1TnA5yqiS_ucn4YcDdyG_VFxropLZiyVPhxd4MiIHpFItugn9DCoMjtiy3Y8lJ6iD2I1A7DAhjlTavVUnxNTc'

# Initialize a Dropbox client
dbx = dropbox.Dropbox(ACCESS_TOKEN)

# Define the folder path
folder_path = '/Filinlämningar/Electric Cars'

# List all files in the folder
try:
    result = dbx.files_list_folder(folder_path)
    print(f"Files in {folder_path}:")
    for entry in result.entries:
        print(entry.name)
except dropbox.exceptions.ApiError as err:
    print(f"Failed to list folder contents: {err}")
@@ -0,0 +1,13 @@
import pyperclip

with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'w') as f2:
        references = f.readlines()
        for ref in references:
            print(ref)
            pyperclip.copy(ref.strip())  # Copy ref to clipboard
            found = input("Found DOI? (y/n): ")
            f2.write(f"{ref.strip()}: {found}\n")
@@ -0,0 +1,26 @@
"""An example program that uses the elsapy module"""

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json

## Load configuration
con_file = open("config.json")
config = json.load(con_file)
con_file.close()

## Initialize client
client = ElsClient(config['apikey'])


def get_doc(doi):
    ## ScienceDirect (full-text) document example using DOI
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(client):
        print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
        return doi
    else:
        return None
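A minimal sketch of calling get_doc(), assuming config.json contains a valid Elsevier API key; the DOI below is a placeholder rather than one of the review references:

if __name__ == "__main__":
    result = get_doc("10.1016/j.example.2024.012345")  # placeholder DOI
    print("retrieved full text" if result else "not available via ScienceDirect")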
@@ -0,0 +1,55 @@
import pandas as pd
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

print(df.head())

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row):
    score = 0
    for arg in arguments:
        if row[arg] > 0:
            sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score


df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# Step 5: Identify the top 3 most positive and negative politicians
top_3_positive = df_arguments.nlargest(3, 'sentiment_score')
top_3_negative = df_arguments.nsmallest(3, 'sentiment_score')

# Step 6: Extract arguments used by these politicians
def extract_arguments(df):
    result = {}
    for _, row in df.iterrows():
        name = row['name']
        used_arguments = [arg for arg in arguments if row[arg] > 0]
        result[name] = used_arguments
    return result


positive_arguments = extract_arguments(top_3_positive)
negative_arguments = extract_arguments(top_3_negative)

# Print the results
print("Top 3 Positive Politicians and their Arguments:")
for name, args in positive_arguments.items():
    print(f"{name}: {args}")

print("\nTop 3 Negative Politicians and their Arguments:")
for name, args in negative_arguments.items():
    print(f"{name}: {args}")
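calculate_sentiment_score only relies on arguments_dict mapping each argument column name to a dict with a 'sentiment' key; a sketch of the structure this assumes for all_arguments.arguments (the argument names here are invented):

# arguments = {
#     "lower_running_costs": {"sentiment": "positive"},
#     "charging_infrastructure_gaps": {"sentiment": "negative"},
#     "battery_production": {"sentiment": "neutral"},
# }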
@@ -0,0 +1,108 @@
import pyperclip
from pprint import pprint
import requests
import crossref_commons.retrieval
from time import sleep
from bs4 import BeautifulSoup
import dl_elsy


def download_file(doi, url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
        content_type = response.headers['Content-Type']

        if content_type == 'application/pdf':
            file_extension = 'pdf'
        elif content_type.startswith('text/'):
            file_extension = 'md'
        else:
            print(f"Unsupported content type: {content_type} for DOI: {doi}")
            return

        file_name = f"{doi}.{file_extension}".replace('/', '_')

        if file_extension == 'md':
            soup = BeautifulSoup(response.content, 'html.parser')
            print(soup.text)
            exit()

        with open(file_name, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded {file_extension.upper()} for DOI: {doi}")

    except requests.exceptions.RequestException as e:
        print(f"Failed to download file for DOI: {doi}. Error: {e}")


def get_article_info(doi):
    url = f'https://doaj.org/api/search/articles/{doi}'
    response = requests.get(url)

    if response.status_code == 200:
        data = response.json()
        for result in data.get('results', []):
            for link in result.get('bibjson', {}).get('link', []):
                if 'mdpi.com' in link['url']:
                    r = requests.get(link['url'])
                    soup = BeautifulSoup(r.content, 'html.parser')
                    pdf_link_html = soup.find('a', {'class': 'UD_ArticlePDF'})
                    pdf_url = 'https://www.mdpi.com' + pdf_link_html['href']
                    pdf = requests.get(pdf_url)
                    with open(f'{doi}.pdf'.replace('/', '_'), 'wb') as f:
                        f.write(pdf.content)
                    sleep(1)
                    epub = requests.get(link['url'] + '/epub')
                    with open(f'{doi}.epub'.replace('/', '_'), 'wb') as f:
                        f.write(epub.content)
                    sleep(1)
                    print(f'Downloaded PDF and EPUB for {doi}')
                elif 'sciencedirect.com' in link['url']:
                    return dl_elsy.get_doc(doi)
                    sleep(1)
                else:
                    print(link['url'])
                    input()
        return doi
    else:
        print(f"Error fetching metadata for DOI: {doi}. HTTP Status Code: {response.status_code}")


# Read DOIs from file
with open('review_references.csv', 'r') as f:
    with open('review_references.txt', 'a') as f2:  # append mode keeps (and creates) the list of already-downloaded DOIs
        references = f.readlines()
# Process each DOI
with open('review_references.txt') as f2:
    ref_done = [line.strip() for line in f2.readlines()]


for ref in references:
    doi = ref.strip()
    print('###', ref.upper())
    try:
        cr = crossref_commons.retrieval.get_publication_as_json(doi)
    except ValueError:
        print(f"Error fetching metadata for DOI: {doi}")
        continue
    if 'sciencedirect.com' not in str(cr):
        continue

    if doi not in ref_done:
        sleep(1)
        r = dl_elsy.get_doc(doi)
        if r:
            with open('review_references.txt', 'a+') as f2:
                f2.write(f'{r}\n')

exit()
for ref in references:
    doi = ref.strip()
    with open('review_references.txt', 'a') as f2:
        r = get_article_info(doi)
        if r:
            f2.write(r)
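get_article_info() walks the DOAJ search response expecting full-text links under bibjson.link; a trimmed sketch of the JSON shape it assumes (the URL value is illustrative):

# {"results": [
#     {"bibjson": {"link": [
#         {"type": "fulltext", "url": "https://www.mdpi.com/..."}
#     ]}}
# ]}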
@@ -0,0 +1,47 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information


# Step 1: Read the CSV file
df = pd.read_csv('speeches.csv', delimiter=';')

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df[['_key', 'name'] + list(arguments)].copy()  # copy() avoids pandas' SettingWithCopyWarning below

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments[arguments] = df_arguments[arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Sum the arguments for each politician
df_sum = df_arguments.groupby('name')[arguments].sum().reset_index()

# Step 5: Plot the data
plt.figure(figsize=(12, 8))
sns.heatmap(df_sum.set_index('name'), annot=True, cmap='coolwarm', cbar=True)
plt.title('Arguments Used by Politicians')
plt.xlabel('Arguments')
plt.ylabel('Politicians')

# Step 6: Color the x-axis labels based on sentiment
ax = plt.gca()
x_labels = ax.get_xticklabels()
for label in x_labels:
    argument = label.get_text()
    sentiment = arguments_dict.get(argument, {}).get('sentiment', 'neutral')
    if sentiment == 'positive':
        label.set_color('green')
    elif sentiment == 'negative':
        label.set_color('red')
    else:
        label.set_color('black')

plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Save the plot instead of showing it
plt.savefig('arguments_used_by_politicians.png')
plt.close()
@@ -0,0 +1,80 @@
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import streamlit as st
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

# Step 2: Extract relevant columns
# Assuming the arguments start from the 5th column onwards
arguments = df.columns[5:]
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)]

# Step 3: Create a binary matrix for arguments
# Convert the argument columns to integers
df_arguments.loc[:, arguments] = df_arguments.loc[:, arguments].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row):
    score = 0
    for arg in arguments:
        if row[arg] > 0:
            sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score


df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# # Step 5: Standardize the data
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])

# Step 6: Dimensionality reduction using PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]

# Step 7: Examine loadings
loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)

# Step 8: Perform clustering
kmeans = KMeans(n_clusters=3)  # Adjust the number of clusters as needed
df_arguments['cluster'] = kmeans.fit_predict(pca_result)

# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')

# Step 9: Plot the data with clusters using Plotly
fig = px.scatter(df_arguments, x='pca1', y='pca2', color='cluster', title='Politicians Grouped by Arguments Used and Clusters',
                 labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'}, hover_data={'name': True})
st.plotly_chart(fig)

# Step 10: Visualize original arguments using Plotly
fig = go.Figure()

# Add arrows for loadings
for argument in arguments:
    fig.add_trace(go.Scatter(x=[0, loadings.loc[argument, 'PCA1']], y=[0, loadings.loc[argument, 'PCA2']],
                             mode='lines+text', text=[None, argument], textposition='top center',
                             line=dict(color='red', width=2), showlegend=False))

# Add scatter plot for politicians
fig.add_trace(go.Scatter(x=df_arguments['pca1'], y=df_arguments['pca2'], mode='markers',
                         marker=dict(color=df_arguments['sentiment_score'], colorscale='Viridis', size=10),
                         text=df_arguments['name'], hoverinfo='text'))

fig.update_layout(title='PCA Biplot of Politicians and Arguments',
                  xaxis_title='PCA Component 1',
                  yaxis_title='PCA Component 2',
                  showlegend=False)

st.plotly_chart(fig)