Initial commit

1 year ago · 08e17d13a5
commit 08e17d13a5
7 changed files with 484 additions and 0 deletions
--- a/_arango.py
+++ b/_arango.py
@ -0,0 +1,55 @@
+
+from arango import ArangoClient
+from dotenv import load_dotenv
+import os
+load_dotenv() # Install with pip install python-dotenv
+class ArangoDB:
+    """
+    Thin wrapper around the ArangoDB database that stores the EV speeches.
+
+    Attributes:
+        client: The ArangoClient used for the connection.
+        db: Handle to the selected database; all methods operate on its
+            'ev_speeches' collection.
+    """
+
+    def __init__(self, db_name='ev_dataharvest', username='dataharvest', password=None, hosts=None):
+        """
+        Connects to the ArangoDB database.
+
+        Args:
+            db_name (str): The name of the database. Defaults to 'ev_dataharvest'.
+            username (str): The username for authentication. Defaults to 'dataharvest'.
+            password (str, optional): The password for authentication. When omitted,
+                the PASSWORD_ARANGO environment variable is used.
+            hosts (str, optional): The host URL(s) passed to ArangoClient. When omitted,
+                the ARANGO_HOSTS environment variable is used.
+        """
+        if password is None:
+            password = os.getenv("PASSWORD_ARANGO")
+        if hosts is None:
+            hosts = os.getenv("ARANGO_HOSTS")
+        self.client = ArangoClient(hosts=hosts)
+        self.db = self.client.db(db_name, username=username, password=password)
+
+    def all_ev_speeches(self):
+        """
+        Retrieves all EV speeches from the 'ev_speeches' collection.
+
+        Returns:
+            A cursor object containing all EV speeches.
+        """
+        return self.db.collection('ev_speeches').all()
+
+    def update_ev_document(self, document):
+        """
+        Updates an EV document in the 'ev_speeches' collection.
+
+        Args:
+            document: The document to be updated.
+
+        Returns:
+            None
+        """
+        # merge=False: nested dictionaries in `document` replace the stored
+        # ones instead of being merged into them.
+        self.db.collection('ev_speeches').update(document, merge=False)
+
+    def get_document_by_id(self, document_id):
+        """
+        Retrieves a document from the 'ev_speeches' collection by its ID.
+
+        Args:
+            document_id (str): The ID of the document to retrieve. Either a bare
+                key ('123') or a full ID ('ev_speeches/123'); only the part
+                after the last '/' is used for the lookup.
+
+        Returns:
+            dict: The retrieved document (python-arango returns None when the
+            key does not exist).
+        """
+        if '/' in document_id:
+            document_id = document_id.split('/')[-1]
+        return self.db.collection('ev_speeches').get(document_id)
--- a/_openai.py
+++ b/_openai.py
@ -0,0 +1,52 @@
+from openai import OpenAI as OAI
+
+class OpenAI:
+    """
+    A class that interacts with the OpenAI API for generating chat-based responses.
+
+    Attributes:
+        chat (bool): Indicates whether the chat mode is enabled.
+        system_prompt (str | None): Optional system message sent with every request.
+        client: An instance of the OpenAI API client.
+        messages (list): The conversation history. In chat mode it holds every
+            user and assistant message; otherwise only the system prompt (if any).
+    """
+
+    def __init__(self, chat=False, system_prompt=None):
+        """
+        Initializes a new instance of the OpenAI class.
+
+        Args:
+            chat (bool, optional): Indicates whether the chat mode is enabled. Defaults to False.
+            system_prompt (str, optional): System message placed first in the
+                conversation. Defaults to None (no system message).
+        """
+        self.chat = chat
+        self.system_prompt = system_prompt
+        # SECURITY: the API key must never be committed to source control.
+        # With no api_key argument the client reads the OPENAI_API_KEY
+        # environment variable. Any key that was previously hard-coded here
+        # is exposed in the repository history and must be revoked.
+        self.client = OAI()
+        self.messages = []
+
+        if self.system_prompt:
+            self.messages.append({"role": "system", "content": self.system_prompt})
+
+    def generate(self, prompt):
+        """
+        Generates a chat-based response using the OpenAI API.
+
+        In chat mode the prompt and the answer are appended to ``self.messages``
+        so later calls see the whole conversation. Outside chat mode each call
+        is independent: only the system prompt (if any) and this prompt are sent,
+        and ``self.messages`` is left untouched.
+
+        Args:
+            prompt (str): The user's input prompt.
+
+        Returns:
+            str: The generated response from the OpenAI model.
+        """
+        user_message = {"role": "user", "content": prompt}
+        if self.chat:
+            self.messages.append(user_message)
+            request_messages = self.messages
+        else:
+            # Build a throw-away list so earlier prompts are not resent.
+            request_messages = self.messages + [user_message]
+
+        chat_completion = self.client.chat.completions.create(
+            messages=request_messages,
+            model="gpt-4o",
+        )
+        answer = chat_completion.choices[0].message.content
+        if self.chat:
+            self.messages.append({"role": "assistant", "content": answer})
+
+        return answer
+        
--- a/analyse_arguments.py
+++ b/analyse_arguments.py
@ -0,0 +1,28 @@
+from _openai import OpenAI
+from _arango import ArangoDB
+
+# Ask the LLM how speakers who raised "Environmental Concerns During
+# Production" argue about CO2 emissions, based on the text of their speeches.
+arango = ArangoDB()
+openai = OpenAI()
+speeches = list(arango.all_ev_speeches())
+
+# (speaker name, speech text) for every speech tagged with the category.
+# Speeches that have not been normalized yet have no 'normalized_arguments'
+# field and are skipped instead of raising KeyError.
+env_against = [
+    (speech['name'], speech['text'])
+    for speech in speeches
+    if 'Environmental Concerns During Production' in speech.get("normalized_arguments", [])
+]
+
+# One "NAME:\ntext" section per speech.
+s = ''.join(f'{name.upper()}:\n{text}\n\n' for name, text in env_against)
+
+# The speeches are appended to the prompt; without them the model has nothing
+# to base its answer on.
+prompt = f"""The following politicians have expressed concerns about the environmental impact of electric vehicle production, particularly regarding "Environmental Concerns During Production". I'm interested in how they argues around CO2 emissions. Could you provide me with some insights? Please answer the following question:\n
+- What is the biggest concern around electric cars?
+- How often are they mentioning CO2 emissions?
+- What argumentation do they use to argue against electric cars if they mention CO2 emissions?
+- Are they relating to the CO2 budget defined by the United Nations?
+Please answer in a structured way and provide as much detail as possible. Also, include examples from the speeches if possible.
+Keep to the information provided in the speeches!
+
+SPEECHES:
+
+{s}"""
+
+answer = openai.generate(prompt)
+print(answer)
--- a/analyze_speeches.py
+++ b/analyze_speeches.py
@ -0,0 +1,239 @@
+from _llm import LLM
+from collections import Counter
+from dotenv import load_dotenv
+from _arango import ArangoDB
+from arguments import arguments as all_arguments
+from colorprinter.print_color import *
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+from sklearn.manifold import TSNE
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+
+def find_argument(argument):
+    """
+    Map a free-text category answer to a key of ``all_arguments``.
+
+    The answer is trimmed of surrounding whitespace, quotes, list bullets and
+    trailing punctuation, then matched case-insensitively as a substring of
+    each category name. The first matching key (in dict order) wins.
+
+    Args:
+        argument (str): The category name as returned by the LLM.
+
+    Returns:
+        str | None: The matching key of ``all_arguments``, or None when the
+        answer is empty or matches no category.
+    """
+    if not argument:
+        return None
+    needle = argument.strip(" \t\r\n\"'`*-.:").lower()
+    # An empty needle is a substring of every key and would wrongly select
+    # the first category.
+    if not needle:
+        return None
+    for key in all_arguments:
+        if needle in key.lower():
+            return key
+    return None
+
+
+def extract_arguments(speeches):
+    """
+    Extract the arguments about electric cars from each speech and store them.
+
+    For every speech the LLM is first asked to list the arguments (one per
+    line), then asked to generalize each of them. The results are written to
+    the speech as ``arguments`` and ``general_arguments`` (parallel lists) and
+    saved through the module-level ``arango`` connection.
+
+    Args:
+        speeches: Iterable of speech documents (dicts with at least "text").
+
+    Returns:
+        None
+    """
+    for speech in speeches:
+        # A fresh chat per speech, so the generalization prompts can refer
+        # back to "the speech earlier".
+        llm = LLM(chat=True)
+        # Get the speech text
+        text = speech["text"]
+        # Make a prompt asking for arguments in the text about electric cars
+        prompt = f'''Below is a speech in the European Union. Please provide arguments in the text about electric cars.\n
+        """{text}"""\n
+        What arguments are there in the text? An argument should be for or against something related to electric cars, or neutral. It should not be a political proposal like "we need electric cars".
+        Answer ONLY with the arguments, no explanations, greetings or other text.
+        Make the argument as detailed as possible so it is possible to understand why the argument is for or against electric cars.
+        If there are no arguments, answer only with "None".
+        Answer with one argument per line.
+        '''
+        # Generate arguments
+        arguments = llm.generate(prompt)
+        print(arguments)
+        # Named so they do not shadow the imported `all_arguments` catalogue.
+        speech_arguments = []
+        general_arguments = []
+        for argument in arguments.split("\n"):
+            # Blank lines between arguments are not arguments.
+            if not argument.strip():
+                continue
+            if "None" in argument:
+                continue
+            prompt = f"""Based on the speech earlier, make this argument more general and less specific, this will make it easier to compare the arguments with other arguments.
+            "{argument}"
+            Generalize the argument as much as possible, so it can be compared with other arguments. Answer ONLY with the generalized argument, no explanations, greetings or other text.
+            """
+            speech_arguments.append(argument)
+            general_argument = llm.generate(prompt)
+            general_arguments.append(general_argument)
+            print(">", general_argument)
+        # Update the document with the arguments
+        speech["arguments"] = speech_arguments
+        speech["general_arguments"] = general_arguments
+        arango.update_ev_document(speech)
+        print("---")
+
+
+def categorize_arguments(arguments_string):
+    """
+    Ask the LLM to condense the collected arguments into a short category list.
+
+    The model's answer is parsed as JSON and written to ``arguments.json``.
+
+    Args:
+        arguments_string (str): All collected arguments, one per line.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If the answer contains no JSON object or the JSON is
+            invalid (json.JSONDecodeError is a ValueError subclass).
+    """
+    import json
+
+    from openai import OpenAI
+
+    # SECURITY: no API key in source. With no api_key argument the client
+    # reads the OPENAI_API_KEY environment variable. Any key that was
+    # previously hard-coded here must be revoked.
+    client = OpenAI()
+    # The example in the prompt must itself be valid JSON, since the model is
+    # asked to imitate it.
+    prompt = f'''I'm collecting arguments for and against electric cars. Here are the list of arguments I have collected so far:
+    """
+    {arguments_string}
+    """
+    I want to make the list much shorter, combining similar arguments into one argument (and the arguments thereby becoming less specific).
+    Can you help me make the list shorter and make it into JSON data like the one below? 
+    {{
+    "Problematic Resource Extraction": {{"argument": "The extraction and processing of raw materials for batteries (e.g., lithium, cobalt) cause serious environmental damage and involve toxic substances. Additionally, mining often involves significant social issues such as child labor and poor working conditions in developing countries.", "sentiment": "negative"}},
+    "Lack of Affordability and Accessibility": {{"argument": "The shift to electric vehicles (EVs) could make car ownership less affordable for low-income individuals, especially given the current lack of charging infrastructure in rural areas.", "sentiment": "negative"}}
+    }}
+    I want to answer as a clean JSON text string, nothing else (as a will load the JSON data into a Python dictionary later on).
+    I'm especially interested in arguments around climate, CO2 and environmental impact, and these can be more detailed than other arguments and not as general.
+    Sometimes there are arguments both for and against electric cars within the same area of discussion (e.g., CO2 emissions). In these cases, formulate one more positive/supporting argument and one more negative/critical argument, like {{'Low CO2 emissions per km': '...', 'High Co2 emissions during production': '...'}} (but formulate them yourself, don't take this example as it is).
+    '''
+    chat_completion = client.chat.completions.create(
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        model="gpt-4o",
+    )
+
+    answer = chat_completion.choices[0].message.content or ""
+    print(answer)
+
+    # Models often wrap JSON in a Markdown fence or add a sentence around it;
+    # keep only the outermost {...} before parsing.
+    start = answer.find("{")
+    end = answer.rfind("}")
+    if start == -1 or end < start:
+        raise ValueError("The model answer does not contain a JSON object.")
+    data = json.loads(answer[start:end + 1])
+
+    # Export to JSON file
+    with open("arguments.json", "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=4)
+
+
+def normalize_arguments():
+    """
+    Assign each extracted argument of every speech to a category.
+
+    Works on the module-level ``speeches`` list. For each argument the LLM is
+    asked to pick a category from ``arguments4prompt``; the answer is resolved
+    with ``find_argument``. An unrecognized answer is retried with a stricter
+    instruction, for at most four attempts in total. The resulting list is
+    stored on the speech as ``normalized_arguments`` and saved through the
+    module-level ``arango`` connection.
+
+    Returns:
+        None
+    """
+    for speech in speeches:
+        # Speeches that never went through extract_arguments have nothing to
+        # normalize.
+        if "arguments" not in speech:
+            continue
+        normalized_arguments = []
+        for argument in speech["arguments"]:
+            if any(
+                [
+                    "no argument" in argument.lower(),
+                    "no input" in argument.lower(),
+                    "(" in argument,
+                ]
+            ):
+                continue
+            # Created only for arguments that are actually sent to the model.
+            llm = LLM(chat=True)
+            prompt = f'''Below is a speech in the European Union:\n
+            """{speech["text"]}"""\n
+            The following argument has been extracted from the speech:\n
+            {argument}\n
+            Please categorize the argument by choosing the most suitable category from the list below:\n
+            {arguments4prompt}
+            If the argument does not fit any of the categories, please choose "None".\n
+            Answer ONLY with the category, no explanations, greetings or other text.
+            '''
+            n = 0
+            category = None
+            while True:
+                n += 1
+                answer = llm.generate(prompt)
+                if "None" in answer:
+                    print("None")
+                    break
+
+                category = find_argument(answer)
+                if category or n > 3:
+                    break
+                else:
+                    # Show the answer that could not be matched.
+                    print('Error:', answer)
+                    prompt += "\nPlease choose a category from the provided list, and answer EXACTLY as it is written in the list."
+            if category:
+                print(category)
+                normalized_arguments.append(category)
+        speech["normalized_arguments"] = normalized_arguments
+        arango.update_ev_document(speech)
+
+
+
+# --- Script body: load the speeches, prepare the prompt inputs used by the
+# functions above, then print usage statistics per argument and per speaker.
+arango = ArangoDB()
+speeches = list(arango.all_ev_speeches())
+print('Number of speeches:', len(speeches))
+
+# All usable generalized arguments, as input for categorize_arguments().
+arguments = []
+for speech in speeches:
+    # Speeches without extracted arguments are skipped instead of raising.
+    for argument in speech.get("general_arguments", []):
+        if any(
+            [
+                "no argument" in argument.lower(),
+                "no input" in argument.lower(),
+                "(" in argument,
+            ]
+        ):
+            continue
+        arguments.append(argument)
+
+arguments_string = "\n-".join(arguments)
+
+# Category list in the form "- NAME: description", used by normalize_arguments().
+arguments4prompt = "".join(
+    f'- {argument.upper()}: {values["argument"]}\n'
+    for argument, values in all_arguments.items()
+)
+
+# Uncomment to (re)classify the arguments of every speech:
+#normalize_arguments()
+
+
+arguments = {}           # category -> one record per use (category data + speaker name)
+speakers = {}            # speaker name -> net sentiment (+1 positive, -1 otherwise)
+speakers_arguments = {}  # speaker name -> categories used, in order
+for speech in speeches:
+    if 'normalized_arguments' not in speech:
+        continue
+    name = speech['name']
+    for arg in speech["normalized_arguments"]:
+        # Copy the catalogue entry: adding the name to the shared dict would
+        # overwrite it on every use and make all records identical.
+        argd = dict(all_arguments[arg], name=name)
+        arguments.setdefault(arg, []).append(argd)
+
+        score = 1 if argd['sentiment'] == 'positive' else -1
+        speakers[name] = speakers.get(name, 0) + score
+
+        speakers_arguments.setdefault(name, []).append(arg)
+
+# (category, usage count, sentiment), most used first.
+sorted_arguments = [
+    (argument, len(usage), all_arguments[argument]['sentiment'])
+    for argument, usage in arguments.items()
+]
+sorted_arguments.sort(key=lambda x: x[1], reverse=True)
+
+print('\n\nArguments:\n')
+positive_arguments = [(argument, usage) for argument, usage, sentiment in sorted_arguments if sentiment == 'positive']
+negative_arguments = [(argument, usage) for argument, usage, sentiment in sorted_arguments if sentiment == 'negative']
+
+for argument, usage in positive_arguments:
+    print_green('+ ', argument, usage)
+
+for argument, usage in negative_arguments:
+    print_red('- ', argument, usage)
+
+
+# Speakers ordered from most positive to most negative.
+sorted_speakers = sorted(speakers.items(), key=lambda x: x[1], reverse=True)
+
+# Five most positive and five most negative speakers. With ten or fewer
+# speakers the two slices would overlap, so everyone is listed once.
+if len(sorted_speakers) > 10:
+    top_and_bottom_speakers = sorted_speakers[:5] + sorted_speakers[-5:]
+else:
+    top_and_bottom_speakers = sorted_speakers
+print('\n\nSpeakers:\n')
+for speaker, sentiment in top_and_bottom_speakers:
+    if sentiment > 0:
+        print_green('+ ', speaker, sentiment)
+    elif sentiment < 0:
+        print_red('- ', speaker, sentiment)
+    else:
+        print_yellow('0 ', speaker, sentiment)
+
+print()
+
--- a/arguments.py
+++ b/arguments.py
@ -0,0 +1,78 @@
+# Catalogue of argument categories about electric vehicles.
+# Maps a category name to a dict with two keys:
+#   "argument":  a short description of what the category covers
+#   "sentiment": "positive" or "negative"
+arguments = {
+    "Problematic Resource Extraction": {
+        "argument": "The extraction and processing of raw materials for batteries (e.g., lithium, cobalt) cause serious environmental damage and involve toxic substances. Additionally, mining often involves significant social issues such as child labor and poor working conditions in developing countries.",
+        "sentiment": "negative",
+    },
+    "Lack of Affordability and Accessibility": {
+        "argument": "The shift to electric vehicles (EVs) could make car ownership less affordable for low-income individuals, especially given the current lack of charging infrastructure in rural areas.",
+        "sentiment": "negative",
+    },
+    "Insufficient Infrastructure": {
+        "argument": "The lack of sufficient charging stations and support infrastructure hinders the widespread adoption of electric vehicles.",
+        "sentiment": "negative",
+    },
+    "Environmental Benefits of EVs": {
+        "argument": "Electric vehicles significantly reduce air pollution and greenhouse gas emissions, promoting a cleaner and healthier environment.",
+        "sentiment": "positive",
+    },
+    "Environmental Concerns During Production": {
+        "argument": "The production process of electric vehicles, including battery manufacturing, has a high environmental impact, including substantial CO2 emissions.",
+        "sentiment": "negative",
+    },
+    "Dependency on Foreign Resources": {
+        "argument": "Electric vehicles increase reliance on imported raw materials, potentially creating geopolitical dependencies and supply chain vulnerabilities.",
+        "sentiment": "negative",
+    },
+    "Support for Sustainable Transportation": {
+        "argument": "Electric vehicles promote sustainable transportation by relying on electricity, which can be sourced from renewable energy.",
+        "sentiment": "positive",
+    },
+    "Economic Impact and Job Losses": {
+        "argument": "Transitioning to electric vehicles may lead to job losses in traditional automotive and related industries.",
+        "sentiment": "negative",
+    },
+    "Technological Advancements": {
+        "argument": "Innovations in electric vehicle technology offer economic growth opportunities and can create new job sectors.",
+        "sentiment": "positive",
+    },
+    "High Upfront Costs": {
+        "argument": "Electric vehicles have higher upfront purchase costs compared to traditional vehicles, which can be a barrier to adoption for many consumers.",
+        "sentiment": "negative",
+    },
+    "Lower Operating Costs": {
+        "argument": "Electric vehicles have lower operating costs due to fewer maintenance requirements and lower fuel expenses, making them cost-effective in the long run.",
+        "sentiment": "positive",
+    },
+    "CO2 Emissions During Operation": {
+        "argument": "Electric vehicles produce lower CO2 emissions per km traveled compared to internal combustion engine vehicles, especially when powered by renewable energy.",
+        "sentiment": "positive",
+    },
+    "CO2 Emissions During Production": {
+        "argument": "The manufacturing process of electric vehicles, particularly battery production, involves significant CO2 emissions.",
+        "sentiment": "negative",
+    },
+    "Support from Government and Incentives": {
+        "argument": "Government subsidies and tax incentives for electric vehicles help make them more affordable and accelerate their adoption.",
+        "sentiment": "positive",
+    },
+    "Need for Electricity Supply": {
+        "argument": "The increased demand for electricity to power a large number of electric vehicles may strain existing power grids and require additional energy production, which could rely on fossil fuels.",
+        "sentiment": "negative",
+    },
+    "Environmental Impact of Electricity Source": {
+        "argument": "The environmental benefits of electric vehicles depend on the cleanliness of the electricity grid they are charged from. Using renewable energy sources maximizes these benefits.",
+        "sentiment": "positive",
+    },
+    "Resource Recycling and Waste Management": {
+        "argument": "Effective recycling of electric vehicle components, especially batteries, can minimize environmental damage and reduce the need for raw material extraction.",
+        "sentiment": "positive",
+    },
+    "Employment Opportunities": {
+        "argument": "The growth of the electric vehicle industry can create new job opportunities in manufacturing, infrastructure development, and renewable energy sectors.",
+        "sentiment": "positive",
+    },
+    "Adoption Hindered by Rural Areas": {
+        "argument": "Rural areas often lack the necessary charging infrastructure, making electric vehicle adoption more challenging in those regions.",
+        "sentiment": "negative",
+    },
+}
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit 02acd147698a2969e096404037f87217cdc9bcea
--- a/docs2csv.py
+++ b/docs2csv.py
@ -0,0 +1,31 @@
+from _arango import ArangoDB
+
+# Export every speech to speeches.csv (semicolon-separated) with one 0/1
+# column per normalized argument category.
+import csv
+
+arango = ArangoDB()
+speeches = list(arango.all_ev_speeches())
+
+# Every category used by at least one speech. Sorted so the column order is
+# the same on every run (a bare set has no defined order). Speeches that
+# have not been normalized yet contribute nothing.
+normalized_arguments = sorted(
+    {
+        argument
+        for speech in speeches
+        for argument in speech.get('normalized_arguments', [])
+    }
+)
+
+for argument in normalized_arguments:
+    print(argument)
+
+# 'w' truncates the file; newline='' lets the csv module control line endings.
+with open('speeches.csv', 'w', newline='', encoding='utf-8') as f:
+    # QUOTE_NONNUMERIC: text fields are quoted (embedded quotes doubled),
+    # the integer 0/1 flags are written bare.
+    writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_NONNUMERIC, lineterminator='\n')
+    # Header row
+    writer.writerow(['_key', 'name', 'party', 'text', 'llm summary', *normalized_arguments])
+    for speech in speeches:
+        used = set(speech.get('normalized_arguments', []))
+        # Semicolons in free text are still replaced by commas, as before, so
+        # consumers that split lines on ';' keep working.
+        text = (speech.get('text') or '').replace(';', ',')
+        summary = (speech.get('llm_summary') or '').replace(';', ',')
+        flags = [1 if argument in used else 0 for argument in normalized_arguments]
+        writer.writerow([speech['_key'], speech['name'], speech['party'], text, summary, *flags])
+
				`@ -0,0 +1 @@`
				`Subproject commit 02acd147698a2969e096404037f87217cdc9bcea`