You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

80 lines
3.2 KiB

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import streamlit as st
from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information
# Step 1: Read the CSV file (semicolon-delimited)
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')
# Step 2: Extract relevant columns
# Every column from positional index 5 onwards (i.e. the sixth column and
# later) is treated as an argument column.
arguments = df.columns[5:]
# Work on an explicit copy so later column assignments never touch `df`.
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)].copy()
# Step 3: Create the integer argument matrix
# Cells that cannot be parsed as numbers become NaN and are then counted as 0.
# NOTE(review): the values are presumably 0/1 flags, but nothing here
# enforces that -- confirm against the CSV.
# The columns are replaced via `df_arguments[arguments] = ...` rather than
# `.loc[:, arguments] = ...`: the `.loc` form tries to write into the existing
# columns in place and can keep their original (object/float) dtype, whereas
# this form swaps in the new integer columns.
df_arguments[arguments] = (
    df_arguments[arguments]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype(int)
)
# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row):
    """Return the net sentiment of the arguments present in one row.

    Every name in the module-level ``arguments`` whose value in ``row`` is
    greater than zero is looked up in ``arguments_dict``. An entry whose
    ``'sentiment'`` is ``'positive'`` adds 1, ``'negative'`` subtracts 1, and
    anything else adds nothing; a missing entry or a missing ``'sentiment'``
    key is treated as ``'neutral'``.
    """
    # Sentiment label of each argument that is present in this row.
    used_sentiments = [
        arguments_dict.get(name, {}).get('sentiment', 'neutral')
        for name in arguments
        if row[name] > 0
    ]
    return used_sentiments.count('positive') - used_sentiments.count('negative')
# Score every politician: each row is passed to calculate_sentiment_score.
df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)
# Step 5 (currently disabled): Standardize the data
# With this step commented out, PCA below runs on the unscaled argument matrix.
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])
# Step 6: Dimensionality reduction using PCA
# Only the argument columns are projected; the sentiment score is left out.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]
# Step 7: Examine loadings
# One row per argument, one column per principal component.
loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)
# Step 8: Perform clustering on the 2-D PCA projection
# random_state pins the centroid initialisation so cluster labels (and hence
# plot colours) do not change between Streamlit reruns; n_init is set
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)  # Adjust the number of clusters as needed
df_arguments['cluster'] = kmeans.fit_predict(pca_result)
# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')
# Step 9: Plot the data with clusters using Plotly
# Cluster labels are integers, which Plotly Express would map onto a
# continuous colour scale. They are categories, so the plot uses a string
# copy of the column (the shared frame is not modified) to get one discrete
# colour and one legend entry per cluster.
fig = px.scatter(
    df_arguments.assign(cluster=df_arguments['cluster'].astype(str)),
    x='pca1',
    y='pca2',
    color='cluster',
    # Keep the legend in numeric cluster order rather than order of appearance.
    category_orders={
        'cluster': [str(label) for label in sorted(df_arguments['cluster'].unique())]
    },
    title='Politicians Grouped by Arguments Used and Clusters',
    labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'},
    hover_data={'name': True},
)
st.plotly_chart(fig)
# Step 10: Visualize original arguments using Plotly
fig = go.Figure()
# Draw each argument's loading as a labelled line segment from the origin
# (plain lines, no arrowheads). The label is attached to the far end only.
for argument, pc1_loading, pc2_loading in loadings[['PCA1', 'PCA2']].itertuples(name=None):
    fig.add_trace(
        go.Scatter(
            x=[0, pc1_loading],
            y=[0, pc2_loading],
            mode='lines+text',
            text=[None, argument],
            textposition='top center',
            line=dict(color='red', width=2),
            showlegend=False,
        )
    )
# Add scatter plot for politicians, coloured by their sentiment score.
# The colour bar is shown so the colour encoding can actually be read.
fig.add_trace(
    go.Scatter(
        x=df_arguments['pca1'],
        y=df_arguments['pca2'],
        mode='markers',
        marker=dict(
            color=df_arguments['sentiment_score'],
            colorscale='Viridis',
            size=10,
            showscale=True,
            colorbar=dict(title=dict(text='Sentiment score')),
        ),
        text=df_arguments['name'],
        hoverinfo='text',  # hover shows only the politician's name
    )
)
fig.update_layout(
    title='PCA Biplot of Politicians and Arguments',
    xaxis_title='PCA Component 1',
    yaxis_title='PCA Component 2',
    showlegend=False,
)
st.plotly_chart(fig)