You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
3.2 KiB
80 lines
3.2 KiB
import pandas as pd |
|
import plotly.express as px |
|
import plotly.graph_objects as go |
|
from sklearn.decomposition import PCA |
|
from sklearn.cluster import KMeans |
|
from sklearn.preprocessing import StandardScaler |
|
import streamlit as st |
|
from all_arguments import arguments as arguments_dict # Arguments dictionary with sentiment information |
|
|
|
# Step 1: Read the CSV file (semicolon-delimited).
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

# Step 2: Extract relevant columns.
# Every column from positional index 5 onward (i.e. after the first five
# columns) is treated as an argument column.
# NOTE(review): the earlier comment said "5th column", but index 5 is the
# sixth column -- confirm against the CSV layout.
arguments = df.columns[5:]

# Take an explicit copy so the column assignments made later operate on an
# independent frame instead of a slice of `df` (avoids
# SettingWithCopyWarning and any ambiguity about what gets modified).
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)].copy()

# Step 3: Build an integer matrix for the arguments.
# Cells that cannot be parsed as numbers, and missing cells, become 0.
# Plain column assignment replaces the columns, so the integer dtype is kept;
# `df.loc[:, cols] = ...` writes into the existing columns and can leave the
# original dtype in place.
df_arguments[arguments] = (
    df_arguments[arguments]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype(int)
)
# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row, argument_names=None, sentiment_lookup=None):
    """Return the net sentiment of the arguments marked as used in *row*.

    Every argument whose value in ``row`` is greater than zero contributes
    +1 when its sentiment is ``'positive'`` and -1 when it is
    ``'negative'``. Any other sentiment value, an argument that is missing
    from the lookup, or an entry without a ``'sentiment'`` key is treated
    as neutral and contributes 0.

    Args:
        row: Mapping-like object (for example a DataFrame row) indexed by
            argument name.
        argument_names: Names to inspect in ``row``. Defaults to the
            module-level ``arguments``.
        sentiment_lookup: Mapping from argument name to a dict that may
            hold a ``'sentiment'`` key. Defaults to the module-level
            ``arguments_dict``.

    Returns:
        Number of positive arguments used minus number of negative ones.
    """
    # Resolve the defaults at call time so single-argument callers such as
    # ``df.apply(calculate_sentiment_score, axis=1)`` keep working. Use
    # ``is None`` (not truthiness) because the default may be a pandas Index
    # and an explicitly passed empty sequence must be honoured.
    if argument_names is None:
        argument_names = arguments
    if sentiment_lookup is None:
        sentiment_lookup = arguments_dict

    score = 0
    for arg in argument_names:
        if row[arg] > 0:
            sentiment = sentiment_lookup.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score
# Attach one net sentiment score per row (politician).
df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# Step 5 (currently disabled): standardize the argument columns before PCA.
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])

# Step 6: Dimensionality reduction using PCA.
# Project the argument matrix onto its first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]

# Step 7: Examine loadings.
# One row per argument, one column per component.
loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)

# Step 8: Perform clustering on the 2-D PCA coordinates.
# A fixed random_state keeps cluster labels stable across Streamlit reruns,
# and an explicit n_init avoids depending on the scikit-learn version's
# default. Adjust the number of clusters as needed.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df_arguments['cluster'] = kmeans.fit_predict(pca_result)
# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')

# Step 9: Plot the data with clusters using Plotly.
# Cluster ids are labels, not quantities. Plotly Express draws a continuous
# color scale for numeric columns, so plot from a copy whose `cluster` column
# is string-typed to get discrete colors and a per-cluster legend. The
# original integer column in `df_arguments` is left untouched.
cluster_plot_df = df_arguments.assign(cluster=df_arguments['cluster'].astype(str))
fig = px.scatter(
    cluster_plot_df,
    x='pca1',
    y='pca2',
    color='cluster',
    # Keep the legend in numeric cluster order regardless of row order.
    category_orders={'cluster': sorted(cluster_plot_df['cluster'].unique(), key=int)},
    title='Politicians Grouped by Arguments Used and Clusters',
    labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'},
    hover_data={'name': True},
)
st.plotly_chart(fig)
# Step 10: Visualize original arguments using Plotly (PCA biplot).
fig = go.Figure()

# Draw one red line per argument from the origin to its (PCA1, PCA2)
# loading, with the argument name as a text label at the tip.
for argument in arguments:
    tip_x = loadings.loc[argument, 'PCA1']
    tip_y = loadings.loc[argument, 'PCA2']
    fig.add_trace(go.Scatter(
        x=[0, tip_x],
        y=[0, tip_y],
        mode='lines+text',
        text=[None, argument],  # label only the tip, not the origin
        textposition='top center',
        line=dict(color='red', width=2),
        showlegend=False,
    ))

# Add scatter plot for politicians, colored by sentiment score.
fig.add_trace(go.Scatter(
    x=df_arguments['pca1'],
    y=df_arguments['pca2'],
    mode='markers',
    marker=dict(
        color=df_arguments['sentiment_score'],
        colorscale='Viridis',
        size=10,
        # Without a colorbar the marker colors cannot be interpreted.
        showscale=True,
        colorbar=dict(title=dict(text='Sentiment score')),
    ),
    text=df_arguments['name'],
    hoverinfo='text',
))

fig.update_layout(
    title='PCA Biplot of Politicians and Arguments',
    xaxis_title='PCA Component 1',
    yaxis_title='PCA Component 2',
    showlegend=False,
)

st.plotly_chart(fig)