"""Clean talk texts and normalize party labels in the ``talks`` collection."""

import re

from arango_client import arango

# Literal residue removed from every text (the original comment describes it
# as a Word "STYLEREF ... MERGEFORMAT" field code).
_FIELD_CODE = "STYLEREF Kantrubrik * MERGEFORMAT"

# "- " with non-whitespace on both sides, e.g. "till- sammans"; this comes
# from line breaks in Word.
# NOTE(review): this also joins suspended hyphens such as "A- och B" -> "Aoch B";
# confirm whether that matters for this corpus.
_HYPHEN_BREAK = re.compile(r"(?<=\S)-\s(?=\S)")

# A newline that follows a character other than whitespace or . ! ? : ; and
# is directly followed by a lowercase letter, i.e. a break inside a sentence.
_MID_SENTENCE_NEWLINE = re.compile(r"(?<=[^\s.!?:;])\n(?=[a-zåäö])")

# Number of documents sent per ``update_many`` call.
_BATCH_SIZE = 100

# Values of ``parti`` that are converted with ``str.title()``.
_ROLE_LABELS = (
    "TALMANNEN",
    "FÖRSTE VICE TALMANNEN",
    "ANDRE VICE TALMANNEN",
    "TREDJE VICE TALMANNEN",
    "ÅLDERSPRESIDENTEN",
    "HANS MAJESTÄT KONUNGEN",
    "TJÄNSTGÖRANDE ÅLDERSPRESIDENTEN",
)


def clean_talk(text):
    """Return *text* with field-code residue and layout line breaks removed.

    Three transformations are applied in order: the literal
    ``STYLEREF Kantrubrik * MERGEFORMAT`` is deleted, "- " between two
    non-whitespace characters is deleted, and a newline in the middle of a
    sentence is replaced by a single space.

    Args:
        text: The string to clean. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        # Null attributes come back from the query as None; treat them as empty.
        return ""
    text = text.replace(_FIELD_CODE, "")
    text = _HYPHEN_BREAK.sub("", text)
    return _MID_SENTENCE_NEWLINE.sub(" ", text)


def _normalize_party(parti):
    """Return the normalized form of a ``parti`` value.

    "FP" becomes "L", "KDS" becomes "KD", the role labels in ``_ROLE_LABELS``
    are title-cased, and the empty string becomes "-". Any other value
    (including ``None``) is returned unchanged.
    """
    if parti == "FP":
        return "L"
    if parti == "KDS":
        return "KD"
    if parti in _ROLE_LABELS:
        # str.title() capitalizes the first letter of every word,
        # e.g. "FÖRSTE VICE TALMANNEN" -> "Förste Vice Talmannen".
        return parti.title()
    if parti == "":
        return "-"
    return parti


def _flush(batch):
    """Write one batch of cleaned talks back to the ``talks`` collection."""
    arango.db.collection("talks").update_many(batch, silent=True)


def main():
    """Clean every talk and write the results back in batches."""
    people = arango.db.aql.execute(
        "FOR p IN people RETURN {'namn': CONCAT(p.tilltalsnamn, ' ', p.efternamn), '_key': p._key}"
    )
    # Keys are stored as strings so that lookups can be done on str(id).
    people_dict = {str(p["_key"]): p["namn"] for p in people}

    cursor = arango.db.aql.execute(
        """FOR t IN talks
        RETURN {'_id': t._id,
                'anforandetext': t.anforandetext,
                'avsnittsrubrik': t.avsnittsrubrik,
                'parti': t.parti,
                'intressent_id': t.intressent_id}""",
        batch_size=100,
        count=True,
    )

    cleaned_talks = []
    n = 0
    for n, talk in enumerate(cursor, start=1):
        talk["anforandetext"] = clean_talk(talk.get("anforandetext"))
        talk["avsnittsrubrik"] = clean_talk(talk.get("avsnittsrubrik"))

        # Convert the id to str for both the membership test and the lookup;
        # people_dict is keyed on strings.
        speaker_id = talk.get("intressent_id")
        if speaker_id is not None:
            speaker = people_dict.get(str(speaker_id))
            if speaker is not None:
                talk["talare"] = speaker

        talk["parti"] = _normalize_party(talk["parti"])

        cleaned_talks.append(talk)
        if len(cleaned_talks) >= _BATCH_SIZE:
            _flush(cleaned_talks)
            print(f"Processed {n} talks", end="\r")
            cleaned_talks = []

    # Write the final partial batch; without this the last talks
    # (fewer than _BATCH_SIZE) would never be updated.
    if cleaned_talks:
        _flush(cleaned_talks)
    print(f"Processed {n} talks")


if __name__ == "__main__":
    main()