# electric_cars_project/docs2csv.py

import csv

from _arango import ArangoDB

def collect_arguments(speeches):
    """Return every distinct normalized argument in *speeches*, sorted.

    Each speech is a mapping with a ``'normalized_arguments'`` iterable.
    The result is sorted so the CSV column order is the same on every run;
    iterating a bare ``set`` of strings gives no such guarantee.
    """
    return sorted({
        argument
        for speech in speeches
        for argument in speech['normalized_arguments']
    })


def write_speeches_csv(speeches, arguments, out):
    """Write a header row plus one row per speech to the text stream *out*.

    Columns are ``_key``, ``name``, ``party``, ``text``, ``llm summary``
    followed by one 0/1 flag column per entry of *arguments*. Fields are
    separated by ``;``; every string field is wrapped in double quotes with
    embedded quotes doubled, and the flags are written as bare ``0``/``1``.

    Args:
        speeches: iterable of mappings with the keys ``_key``, ``name``,
            ``party``, ``text``, ``llm_summary`` and ``normalized_arguments``.
        arguments: ordered sequence of argument names used as flag columns.
        out: any object with a ``write(str)`` method.
    """
    writer = csv.writer(
        out,
        delimiter=';',
        quoting=csv.QUOTE_NONNUMERIC,
        lineterminator='\n',
    )
    writer.writerow(['_key', 'name', 'party', 'text', 'llm summary', *arguments])
    for speech in speeches:
        # Build the set once per speech so each column test is O(1).
        used = set(speech['normalized_arguments'])
        writer.writerow([
            str(speech['_key']),
            str(speech['name']),
            str(speech['party']),
            # Semicolons in free text are still turned into commas, as before,
            # so consumers that split naively on ';' keep working.
            speech['text'].replace(';', ','),
            speech['llm_summary'].replace(';', ','),
            # Integers are left unquoted by QUOTE_NONNUMERIC.
            *(int(argument in used) for argument in arguments),
        ])


def main():
    """Dump the speeches returned by ``ArangoDB().all_ev_speeches()`` to
    ``speeches.csv`` in the current directory, printing each argument column
    name to stdout first.
    """
    arango = ArangoDB()
    speeches = list(arango.all_ev_speeches())
    arguments = collect_arguments(speeches)

    for argument in arguments:
        print(argument)

    # 'w' replaces any previous export; newline='' lets the csv writer control
    # line endings, and an explicit encoding keeps non-ASCII text portable.
    with open('speeches.csv', 'w', newline='', encoding='utf-8') as f:
        write_speeches_csv(speeches, arguments, f)


if __name__ == '__main__':
    main()