# electric_cars_project/group_speakers_streamlit.py

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import streamlit as st
from all_arguments import arguments as arguments_dict  # Arguments dictionary with sentiment information

# Step 1: Read the CSV file
# The file is parsed with ';' as the field separator.
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

# Step 2: Extract relevant columns
# Every column from positional index 5 onwards (i.e. starting with the 6th
# column) is treated as an argument column.
# NOTE(review): the previous comment said "5th column"; the slice actually
# starts at the 6th -- confirm against the CSV layout.
arguments = df.columns[5:]
# .copy() gives an independent frame so the column assignments below never
# write into (or warn about) a view of `df`.
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)].copy()

# Step 3: Create an integer matrix for arguments
# Cells that cannot be parsed as numbers (or are missing) become 0; all other
# values are cast to int. Values greater than 1 are kept as-is, not clipped.
# The result is assigned with df[cols] = ... instead of df.loc[:, cols] = ...
# because, since pandas 2.0, the .loc form writes into the existing columns
# and keeps their original dtype where it can, which would silently undo the
# int cast.
df_arguments[arguments] = (
    df_arguments[arguments]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype(int)
)

# Step 4: Calculate sentiment scores for each politician
def calculate_sentiment_score(row, argument_names=None, sentiment_lookup=None):
    """Return the net sentiment of the arguments used in one row.

    Each argument whose value in ``row`` is greater than 0 contributes +1 when
    its sentiment is ``'positive'``, -1 when it is ``'negative'`` and 0 for any
    other sentiment. Arguments missing from the lookup, or whose entry has no
    ``'sentiment'`` key, are treated as ``'neutral'``.

    Args:
        row: Object indexable by argument name (e.g. a DataFrame row) whose
            values can be compared with 0.
        argument_names: Iterable of argument names to inspect. Defaults to the
            module-level ``arguments``.
        sentiment_lookup: Mapping from argument name to a dict that may hold a
            ``'sentiment'`` key. Defaults to the module-level
            ``arguments_dict``.

    Returns:
        The integer score (positive count minus negative count).
    """
    # Fall back to the module globals so existing one-argument callers such
    # as DataFrame.apply(calculate_sentiment_score, axis=1) keep working.
    if argument_names is None:
        argument_names = arguments
    if sentiment_lookup is None:
        sentiment_lookup = arguments_dict

    weights = {'positive': 1, 'negative': -1}
    score = 0
    for arg in argument_names:
        if row[arg] > 0:
            sentiment = sentiment_lookup.get(arg, {}).get('sentiment', 'neutral')
            score += weights.get(sentiment, 0)
    return score

# One net sentiment score per row, computed from the argument columns.
df_arguments['sentiment_score'] = df_arguments.apply(calculate_sentiment_score, axis=1)

# Step 5 (disabled): Standardize the data
# Standardization is currently switched off, so the PCA below runs on the
# unscaled argument values. Uncomment the two lines to re-enable it.
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])

# Fixed seed: Streamlit re-executes this script on every interaction, and
# without a seed the cluster labels (and hence colours) can change per rerun.
RANDOM_STATE = 42

# Step 6: Dimensionality reduction using PCA
# Project the argument matrix onto its first two principal components.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]

# Step 7: Examine loadings
# components_ has one row per component; transposing gives one row per
# argument with its weight on each of the two components.
loadings = pd.DataFrame(pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments)

# Step 8: Perform clustering
# Cluster in the 2-D PCA space. KMeans rejects n_clusters > n_samples, so cap
# the requested 3 clusters at the number of rows. n_init is pinned because
# its default differs between scikit-learn releases.
n_clusters = min(3, len(df_arguments))  # Adjust the number of clusters as needed
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=RANDOM_STATE)
df_arguments['cluster'] = kmeans.fit_predict(pca_result)

# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')

# Step 9: Plot the data with clusters using Plotly
# Cluster ids are labels, not quantities. Plotly Express maps numeric columns
# to a continuous colour scale, so plot from a copy whose 'cluster' column is
# a string; that yields one discrete colour and legend entry per cluster.
# df_arguments itself keeps the integer column.
cluster_plot_df = df_arguments.assign(cluster=df_arguments['cluster'].astype(str))
fig = px.scatter(
    cluster_plot_df,
    x='pca1',
    y='pca2',
    color='cluster',
    title='Politicians Grouped by Arguments Used and Clusters',
    labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'},
    hover_data={'name': True},
    # Keep the legend in numeric cluster order rather than order of appearance.
    category_orders={'cluster': sorted(cluster_plot_df['cluster'].unique(), key=int)},
)
st.plotly_chart(fig)

# Step 10: Visualize original arguments using Plotly
fig = go.Figure()

# Draw one red line segment per argument, from the origin to its
# (PCA1, PCA2) loading, labelled with the argument name at the far end.
# Iterating the rows of `loadings` avoids a label lookup per argument.
for loading in loadings.itertuples():
    fig.add_trace(go.Scatter(
        x=[0, loading.PCA1],
        y=[0, loading.PCA2],
        mode='lines+text',
        text=[None, loading.Index],
        textposition='top center',
        line=dict(color='red', width=2),
        showlegend=False,
    ))

# Add scatter plot for politicians, coloured by sentiment score. The colour
# bar is shown explicitly; without it the colours cannot be interpreted.
fig.add_trace(go.Scatter(
    x=df_arguments['pca1'],
    y=df_arguments['pca2'],
    mode='markers',
    marker=dict(
        color=df_arguments['sentiment_score'],
        colorscale='Viridis',
        size=10,
        showscale=True,
        colorbar=dict(title='Sentiment score'),
    ),
    text=df_arguments['name'],
    hoverinfo='text',
))

fig.update_layout(
    title='PCA Biplot of Politicians and Arguments',
    xaxis_title='PCA Component 1',
    yaxis_title='PCA Component 2',
    showlegend=False,
)

st.plotly_chart(fig)