"""Clean the text fields and party labels of the documents in ``talks``.

Run as a script: every talk is read from the ``talks`` collection, cleaned
in memory and written back in batches with ``update_many``.
"""

import re

from arango_client import arango

# Word field-code residue. The backslash is optional so that both
# "STYLEREF Kantrubrik \* MERGEFORMAT" and the variant without the
# backslash are removed.
_STYLEREF_FIELD = re.compile(r"STYLEREF Kantrubrik \\?\* MERGEFORMAT")
# "till- sammans" -> "tillsammans": a hyphen followed by one whitespace
# character with non-whitespace on both sides (left over from line breaks
# in Word).
# NOTE(review): this also joins suspended hyphens such as "in- och utrikes";
# confirm that is acceptable for the data.
_BROKEN_HYPHEN = re.compile(r"(?<=\S)-\s(?=\S)")
# A newline that does not follow whitespace or sentence punctuation and is
# followed by a lowercase letter is treated as a mid-sentence line break.
_MID_SENTENCE_NEWLINE = re.compile(r"(?<=[^\s.!?:;])\n(?=[a-zåäö])")

# Number of cleaned talks sent to the database per ``update_many`` call.
_BATCH_SIZE = 100

# Old party abbreviations mapped to the ones used after cleaning.
_PARTY_RENAMES = {"FP": "L", "KDS": "KD"}

# Values of ``parti`` that are written in capitals and get title-cased.
_UPPERCASE_ROLES = frozenset(
    {
        "TALMANNEN",
        "FÖRSTE VICE TALMANNEN",
        "ANDRE VICE TALMANNEN",
        "TREDJE VICE TALMANNEN",
        "ÅLDERSPRESIDENTEN",
        "HANS MAJESTÄT KONUNGEN",
        "TJÄNSTGÖRANDE ÅLDERSPRESIDENTEN",
    }
)


def clean_talk(text):
    """Return *text* with Word artefacts and stray line breaks removed.

    Three clean-ups are applied in order: the ``STYLEREF Kantrubrik``
    field code is deleted, hyphenation left over from line breaks is
    joined ("till- sammans" -> "tillsammans"), and newlines in the middle
    of a sentence are replaced by a space.

    Args:
        text: The string to clean. ``None`` and the empty string are
            returned unchanged instead of raising.

    Returns:
        The cleaned string, or *text* itself when it is empty or ``None``.
    """
    if not text:
        # The AQL projection yields null for an attribute a document does
        # not have, so None can reach this function.
        return text
    text = _STYLEREF_FIELD.sub("", text)
    text = _BROKEN_HYPHEN.sub("", text)
    return _MID_SENTENCE_NEWLINE.sub(" ", text)


def _normalize_party(party):
    """Return the cleaned party label; non-string values pass through."""
    if not isinstance(party, str):
        return party
    party = _PARTY_RENAMES.get(party, party)
    if party in _UPPERCASE_ROLES:
        # str.title() capitalises every word, e.g. "Förste Vice Talmannen".
        return party.title()
    # An empty party is stored as "-".
    return party or "-"


def _clean_record(talk, people_dict):
    """Clean one talk document in place and return it.

    Args:
        talk: Dict with the keys returned by the talks query in ``main``.
        people_dict: Mapping from a person's ``_key`` (as ``str``) to name.
    """
    talk["anforandetext"] = clean_talk(talk.get("anforandetext", ""))
    talk["avsnittsrubrik"] = clean_talk(talk.get("avsnittsrubrik", ""))

    # people_dict is keyed by str(_key), so the id is converted once and
    # the same value is used for both the membership test and the lookup.
    speaker_key = str(talk.get("intressent_id"))
    if speaker_key in people_dict:
        talk["talare"] = people_dict[speaker_key]

    talk["parti"] = _normalize_party(talk["parti"])
    return talk


def _write_batch(collection, batch, processed):
    """Send *batch* to the database and print the running total."""
    collection.update_many(batch, silent=True)
    print(f"Processed {processed} talks", end="\r")


def main():
    """Clean every document in ``talks`` and write the result back."""
    people = arango.db.aql.execute(
        "FOR p IN people RETURN {'namn': CONCAT(p.tilltalsnamn, ' ', p.efternamn), '_key': p._key}"
    )
    people_dict = {str(p["_key"]): p["namn"] for p in people}

    cursor = arango.db.aql.execute(
        """FOR t IN talks RETURN {'_id': t._id, 'anforandetext': t.anforandetext, 'avsnittsrubrik': t.avsnittsrubrik, 'parti': t.parti, 'intressent_id': t.intressent_id}""",
        batch_size=_BATCH_SIZE,
        count=True,
    )
    talks = arango.db.collection("talks")

    cleaned_talks = []
    n = 0
    for n, talk in enumerate(cursor, start=1):
        cleaned_talks.append(_clean_record(talk, people_dict))
        if len(cleaned_talks) >= _BATCH_SIZE:
            _write_batch(talks, cleaned_talks, n)
            cleaned_talks = []

    # Write the last, partially filled batch; without this the final
    # (fewer than _BATCH_SIZE) talks would never be saved.
    if cleaned_talks:
        _write_batch(talks, cleaned_talks, n)
    # End the "\r" progress line with a final total.
    print(f"Processed {n} talks")


if __name__ == "__main__":
    main()