# electric_cars_project/analyze_speeches.py

from _llm import LLM
from collections import Counter
from dotenv import load_dotenv
from _arango import ArangoDB
from all_arguments import arguments as all_arguments
from colorprinter.print_color import *
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np


def find_argument(argument, candidates=None):
    """Return the first known argument name that contains ``argument``.

    The comparison is a case-insensitive substring test against each name, in
    the iteration order of ``candidates``.

    Args:
        argument: Text to look for, e.g. a category name answered by the LLM.
            Surrounding whitespace is ignored.
        candidates: Optional mapping (or other iterable) of argument names to
            search. Defaults to the module-level ``all_arguments``.

    Returns:
        The matching name exactly as it is written in ``candidates``, or
        ``None`` when ``argument`` is empty/blank or nothing matches.
    """
    needle = argument.strip().lower() if argument else ""
    if not needle:
        # "" is a substring of every name, so without this guard a blank
        # answer would be reported as the first category.
        return None
    if candidates is None:
        candidates = all_arguments
    return next((key for key in candidates if needle in key.lower()), None)


def extract_arguments(speeches):
    """Extract electric-car arguments from each speech with the LLM and store them.

    For every speech a new chat ``LLM`` is created and asked to list the
    arguments found in ``speech["text"]``, one per line. Each kept line is then
    sent to the same ``LLM`` instance to be generalized. The results are written
    to ``speech["arguments"]`` and ``speech["general_arguments"]`` (one entry
    per kept line in each) and the speech is saved with the module-level
    ``arango.update_ev_document``.

    Args:
        speeches: Iterable of dict-like speech documents that have a ``"text"``
            entry. Each document is modified in place.
    """
    for speech in speeches:
        # One LLM instance per speech: the generalization prompt below refers
        # to "the speech earlier" (presumably chat=True keeps the history).
        llm = LLM(chat=True)
        # Get the speech text
        text = speech["text"]
        # Make a prompt asking for arguments in the text about electric cars
        prompt = f'''Below is a speech in the European Union. Please provide arguments in the text about electric cars.\n
        """{text}"""\n
        What arguments are there in the text? An argument should be for or against something related to electric cars, or neutral. It should not be a political proposal like "we need electric cars".
        Answer ONLY with the arguments, no explanations, greetings or other text.
        Make the argument as detailed as possible so it is possible to understand why the argument is for or against electric cars.
        If there are no arguments, answer only with "None".
        Answer with one argument per line.
        '''
        # Generate arguments
        response = llm.generate(prompt)
        print(response)
        extracted_arguments = []
        general_arguments = []
        for line in response.splitlines():
            argument = line.strip()
            # Blank lines between arguments are not arguments; do not send
            # them to the LLM or store them.
            if not argument:
                continue
            # The model is told to answer "None" when it finds no arguments.
            if "None" in argument:
                continue
            prompt = f"""Based on the speech earlier, make this argument more general and less specific, this will make it easier to compare the arguments with other arguments.
            "{argument}"
            Generalize the argument as much as possible, so it can be compared with other arguments. Answer ONLY with the generalized argument, no explanations, greetings or other text.
            """
            extracted_arguments.append(argument)
            general_argument = llm.generate(prompt)
            general_arguments.append(general_argument)
            print(">", general_argument)
        # Update the document with the arguments
        speech["arguments"] = extracted_arguments
        speech["general_arguments"] = general_arguments
        arango.update_ev_document(speech)
        print("---")


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    lines = stripped.splitlines()[1:]  # drop the opening ``` / ```json line
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines).strip()


def categorize_arguments(arguments_string):
    """Ask GPT-4o to merge the collected arguments into a short JSON category list.

    The model's answer is printed, parsed as JSON and written to
    ``arguments.json`` in the current working directory.

    Args:
        arguments_string: The collected arguments as one string; it is inserted
            verbatim into the prompt.

    Raises:
        json.JSONDecodeError: If the model's answer is not valid JSON.
    """
    import json
    import os

    from dotenv import load_dotenv
    from openai import OpenAI

    # SECURITY: the API key must never be committed to source control. It is
    # read from the OPENAI_API_KEY environment variable (optionally loaded from
    # a local .env file). Any key that was previously hard-coded here should be
    # treated as leaked and revoked.
    load_dotenv()
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    prompt = f'''I'm collecting arguments for and against electric cars. Here are the list of arguments I have collected so far:
    """
    {arguments_string}
    """
    I want to make the list much shorter, combining similar arguments into one argument (and the arguments thereby becoming less specific).
    Can you help me make the list shorter and make it into JSON data like the one below?
    {{
    "Problematic Resource Extraction": {{"argument": "The extraction and processing of raw materials for batteries (e.g., lithium, cobalt) cause serious environmental damage and involve toxic substances. Additionally, mining often involves significant social issues such as child labor and poor working conditions in developing countries.", "sentiment": "negative"}},
    "Lack of Affordability and Accessibility": {{"argument": "The shift to electric vehicles (EVs) could make car ownership less affordable for low-income individuals, especially given the current lack of charging infrastructure in rural areas.", "sentiment": "negative"}}
    }}
    I want to answer as a clean JSON text string, nothing else (as a will load the JSON data into a Python dictionary later on).
    I'm especially interested in arguments around climate, CO2 and environmental impact, and these can be more detailed than other arguments and not as general.
    Sometimes there are arguments both for and against electric cars within the same area of discussion (e.g., CO2 emissions). In these cases, formulate one more positive/supporting argument and one more negative/critical argument, like {{'Low CO2 emissions per km': '...', 'High Co2 emissions during production': '...'}} (but formulate them yourself, don't take this example as it is).
    '''
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="gpt-4o",
    )

    answer = chat_completion.choices[0].message.content
    print(answer)

    # Export to JSON file. Models often wrap JSON in a ```json fence even when
    # asked not to, so remove it before parsing.
    data = json.loads(_strip_code_fence(answer))
    with open("arguments.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)


def normalize_arguments():
    """Map every speech's extracted arguments onto the known argument categories.

    Iterates the module-level ``speeches``. For each entry of
    ``speech["arguments"]`` (except placeholder answers) a chat ``LLM`` is asked
    to pick a category from ``arguments4prompt``; the answer is resolved to a
    category name with ``find_argument``. The matched names are stored in
    ``speech["normalized_arguments"]`` and the speech is saved with
    ``arango.update_ev_document``.
    """
    max_attempts = 4  # one initial try plus up to three reminders

    for speech in speeches:
        normalized_arguments = []
        for argument in speech["arguments"]:
            # Skip placeholder answers before creating an LLM for them.
            lowered = argument.lower()
            if "no argument" in lowered or "no input" in lowered or "(" in argument:
                continue

            llm = LLM(chat=True)
            prompt = f'''Below is a speech in the European Union:\n
            """{speech["text"]}"""\n
            The following argument has been extracted from the speech:\n
            {argument}\n
            Please categorize the argument by choosing the most suitable category from the list below:\n
            {arguments4prompt}
            If the argument does not fit any of the categories, please choose "None".\n
            Answer ONLY with the category, no explanations, greetings or other text.
            '''
            category = None
            for attempt in range(1, max_attempts + 1):
                answer = llm.generate(prompt)
                if "None" in answer:
                    print("None")
                    break

                category = find_argument(answer)
                if category or attempt == max_attempts:
                    break
                # Show the answer that could not be matched, then ask again
                # with a stricter instruction appended to the prompt.
                print('Error:', answer)
                prompt += "\nPlease choose a category from the provided list, and answer EXACTLY as it is written in the list."
            if category:
                print(category)
                normalized_arguments.append(category)
        speech["normalized_arguments"] = normalized_arguments
        arango.update_ev_document(speech)


# Load all speeches once; the functions above read these module-level names.
arango = ArangoDB()
speeches = list(arango.all_ev_speeches())
print('Number of speeches:', len(speeches))

# Collect the generalized arguments of all speeches, leaving out placeholder
# answers ("no argument", "no input") and entries containing a parenthesis.
arguments = []
for speech in speeches:
    # A speech that has no "general_arguments" entry yet contributes nothing
    # instead of raising KeyError.
    for argument in speech.get("general_arguments", []):
        lowered = argument.lower()
        if "no argument" in lowered or "no input" in lowered or "(" in argument:
            continue
        arguments.append(argument)

# One "- " bullet per argument, including the first one.
arguments_string = "\n".join(f"- {argument}" for argument in arguments)

# Category list for the normalize_arguments() prompt: "- NAME: description".
arguments4prompt = "".join(
    f'- {name.upper()}: {values["argument"]}\n'
    for name, values in all_arguments.items()
)

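# One-off step: uncomment to (re)classify every speech's arguments with the LLM
# and store them as "normalized_arguments", which the report below reads.
# normalize_arguments()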


# Aggregate the normalized arguments of all speeches:
#   arguments           category name -> list of usages (category data + speaker)
#   speakers            speaker name  -> net sentiment score
#   speakers_arguments  speaker name  -> category names the speaker used
arguments = {}
speakers = {}
speakers_arguments = {}
for speech in speeches:
    if 'normalized_arguments' not in speech:
        continue
    name = speech['name']
    for arg in speech["normalized_arguments"]:
        # Copy the category data: writing 'name' into the shared all_arguments
        # entry would make every recorded usage show the last speaker.
        argd = {**all_arguments[arg], 'name': name}
        arguments.setdefault(arg, []).append(argd)

        # Only explicitly negative categories count against a speaker; any
        # other sentiment value leaves the score unchanged.
        sentiment = argd['sentiment']
        if sentiment == 'positive':
            score = 1
        elif sentiment == 'negative':
            score = -1
        else:
            score = 0
        speakers[name] = speakers.get(name, 0) + score

        speakers_arguments.setdefault(name, []).append(arg)

# (category name, usage count, sentiment) triples, most used category first.
sorted_arguments = sorted(
    (
        (category, len(usages), all_arguments[category]['sentiment'])
        for category, usages in arguments.items()
    ),
    key=lambda entry: entry[1],
    reverse=True,
)

print('\n\nArguments:\n')

# Split the ranking by sentiment; the usage order is kept within each group.
positive_arguments = [
    (category, count)
    for category, count, tone in sorted_arguments
    if tone == 'positive'
]
negative_arguments = [
    (category, count)
    for category, count, tone in sorted_arguments
    if tone == 'negative'
]

for argument, usage in positive_arguments:
    print_green('+ ', argument, usage)

for argument, usage in negative_arguments:
    print_red('- ', argument, usage)


# (speaker, net sentiment score) pairs, most positive speaker first.
sorted_speakers = sorted(speakers.items(), key=lambda item: item[1], reverse=True)

# Show the five most positive and the five most negative speakers. With ten or
# fewer speakers the two slices would overlap and print people twice, so show
# everyone exactly once instead.
top_n = 5
if len(sorted_speakers) <= 2 * top_n:
    top_and_bottom_speakers = sorted_speakers
else:
    top_and_bottom_speakers = sorted_speakers[:top_n] + sorted_speakers[-top_n:]

print('\n\nSpeakers:\n')
for speaker, sentiment in top_and_bottom_speakers:
    if sentiment > 0:
        print_green('+ ', speaker, sentiment)
    elif sentiment < 0:
        print_red('- ', speaker, sentiment)
    else:
        print_yellow('0 ', speaker, sentiment)

print()