"""Streamlit app: group politicians by the arguments they use.

Pipeline: load the speeches CSV, build an integer argument matrix, compute a
per-politician sentiment score, project the matrix to two PCA components,
cluster the projection with KMeans, and render two Plotly charts.
"""

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import streamlit as st

# Arguments dictionary with sentiment information
from all_arguments import arguments as arguments_dict

# Step 1: Read the CSV file (semicolon-delimited).
df = pd.read_csv('Blad 1-speeches_sep.csv', delimiter=';')

# Step 2: Extract relevant columns.
# Every column from positional index 5 (i.e. the sixth column) onward is
# treated as an argument column.
arguments = df.columns[5:]
# Explicit copy so the assignments below never write through to `df`.
df_arguments = df.loc[:, ['_key', 'name'] + list(arguments)].copy()

# Step 3: Create the integer matrix for arguments.
# Non-numeric cells are coerced to NaN and then counted as 0. Plain column
# assignment (rather than `.loc[:, cols] = ...`) replaces the columns, so the
# integer dtype is actually applied instead of being cast back in place.
df_arguments[arguments] = (
    df_arguments[arguments]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype(int)
)


# Step 4: Calculate sentiment scores for each politician.
def calculate_sentiment_score(row):
    """Return the net sentiment of the arguments used in *row*.

    For every argument column whose value in *row* is greater than 0, the
    argument's ``'sentiment'`` entry in ``arguments_dict`` is looked up
    (missing arguments or missing entries count as ``'neutral'``):
    ``'positive'`` adds 1, ``'negative'`` subtracts 1, anything else adds 0.

    Args:
        row: A row of ``df_arguments`` indexable by argument column name.

    Returns:
        The integer sum of the +1 / -1 contributions.
    """
    score = 0
    for arg in arguments:
        if row[arg] > 0:
            sentiment = arguments_dict.get(arg, {}).get('sentiment', 'neutral')
            if sentiment == 'positive':
                score += 1
            elif sentiment == 'negative':
                score -= 1
    return score


df_arguments['sentiment_score'] = df_arguments.apply(
    calculate_sentiment_score, axis=1
)

# Step 5: Standardize the data (currently disabled; PCA runs on the raw
# integer matrix).
# scaler = StandardScaler()
# df_arguments[arguments] = scaler.fit_transform(df_arguments[arguments])

# Step 6: Dimensionality reduction using PCA.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(df_arguments[arguments])
df_arguments['pca1'] = pca_result[:, 0]
df_arguments['pca2'] = pca_result[:, 1]

# Step 7: Examine loadings (one row per argument, one column per component).
loadings = pd.DataFrame(
    pca.components_.T, columns=['PCA1', 'PCA2'], index=arguments
)

# Step 8: Perform clustering on the 2-D projection.
# Adjust the number of clusters as needed. `random_state` is fixed so cluster
# assignments do not change between Streamlit reruns, and `n_init` is set
# explicitly because its default differs across scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
df_arguments['cluster'] = kmeans.fit_predict(pca_result)

# Streamlit app
st.title('Politicians Grouped by Arguments Used and Sentiment Score')

# Step 9: Plot the data with clusters using Plotly.
fig = px.scatter(
    df_arguments,
    x='pca1',
    y='pca2',
    color='cluster',
    title='Politicians Grouped by Arguments Used and Clusters',
    labels={'pca1': 'PCA Component 1', 'pca2': 'PCA Component 2'},
    hover_data={'name': True},
)
st.plotly_chart(fig)

# Step 10: Visualize original arguments using Plotly (biplot).
biplot = go.Figure()

# Add one line per argument from the origin to its loading, labelled at the
# tip (the first text entry is None so only the end point carries a label).
for argument in arguments:
    biplot.add_trace(
        go.Scatter(
            x=[0, loadings.loc[argument, 'PCA1']],
            y=[0, loadings.loc[argument, 'PCA2']],
            mode='lines+text',
            text=[None, argument],
            textposition='top center',
            line=dict(color='red', width=2),
            showlegend=False,
        )
    )

# Add scatter plot for politicians, coloured by sentiment score.
biplot.add_trace(
    go.Scatter(
        x=df_arguments['pca1'],
        y=df_arguments['pca2'],
        mode='markers',
        marker=dict(
            color=df_arguments['sentiment_score'],
            colorscale='Viridis',
            size=10,
        ),
        text=df_arguments['name'],
        hoverinfo='text',
    )
)

biplot.update_layout(
    title='PCA Biplot of Politicians and Arguments',
    xaxis_title='PCA Component 1',
    yaxis_title='PCA Component 2',
    showlegend=False,
)
st.plotly_chart(biplot)