Implement clean_talk function and main script for processing talks

master
Lasse Server 2 months ago
parent bd922c498f
commit 54db0e339b
  1. 60
      scripts/clean_talks.py

@ -0,0 +1,60 @@
from arango_client import arango
def clean_talk(text):
    """Remove Word-export artifacts from a talk text or heading.

    Three clean-ups are applied, in order:

    1. The leftover Word field code ``STYLEREF Kantrubrik \\* MERGEFORMAT``
       (with or without the backslash) is removed.
    2. Hyphens left behind by Word line breaks are removed, so that
       ``"till- sammans"`` becomes ``"tillsammans"``. Suspended hyphens
       that are followed by "och"/"eller" (``"hälso- och sjukvård"``) are
       legitimate Swedish and are kept.
    3. Line breaks in the middle of a sentence (no sentence-ending
       punctuation before, a lowercase letter after) become a single space.

    Args:
        text: The string to clean. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string. Empty or ``None`` input is returned unchanged.
    """
    # Local import: the module does not import `re` at file level.
    import re

    # Missing attributes arrive as None from the database; there is
    # nothing to clean and the string operations below would fail on it.
    if not text:
        return text

    # Remove the Word field code, whether or not the backslash survived.
    text = re.sub(r"STYLEREF Kantrubrik \\?\* MERGEFORMAT", "", text)
    # Remove "- " with text on both sides (line-break hyphenation from Word),
    # but keep suspended hyphens such as "hälso- och sjukvård".
    text = re.sub(r"(?<=\S)-\s(?!(?:och|eller)\b)(?=\S)", "", text)
    # Join lines that were broken in the middle of a sentence.
    text = re.sub(r"(?<=[^\s.!?:;])\n(?=[a-zåäö])", " ", text)
    return text
# Party codes that have since been renamed, mapped to the code used instead.
PARTY_ALIASES = {"FP": "L", "KDS": "KD"}

# Values of `parti` that are all-caps role titles rather than party codes.
PRESIDING_TITLES = (
    "TALMANNEN",
    "FÖRSTE VICE TALMANNEN",
    "ANDRE VICE TALMANNEN",
    "TREDJE VICE TALMANNEN",
    "ÅLDERSPRESIDENTEN",
    "HANS MAJESTÄT KONUNGEN",
    "TJÄNSTGÖRANDE ÅLDERSPRESIDENTEN",
)

# Number of talks fetched per cursor batch and written per update call.
BATCH_SIZE = 100


def normalize_party(parti):
    """Return the normalized form of a talk's ``parti`` value.

    Renamed party codes are mapped to their replacement, all-caps role
    titles are converted with ``str.title()``, and the empty string becomes
    ``"-"``. Any other value (including ``None``) is returned unchanged.
    """
    if not isinstance(parti, str):
        return parti
    if parti in PARTY_ALIASES:
        return PARTY_ALIASES[parti]
    if parti in PRESIDING_TITLES:
        # str.title() capitalizes every word ("Förste Vice Talmannen").
        # NOTE(review): an earlier comment asked for "first letter uppercase
        # and rest lowercase", which would be str.capitalize() — confirm
        # which form is wanted before changing stored values.
        return parti.title()
    if parti == "":
        return "-"
    return parti


def main():
    """Clean every document in ``talks`` and write the result back.

    For each talk the text and section heading are passed through
    ``clean_talk``, the speaker name is attached as ``talare`` when the
    talk's ``intressent_id`` matches a person, and ``parti`` is normalized.
    Updates are written in batches of ``BATCH_SIZE``.
    """
    people = arango.db.aql.execute(
        "FOR p IN people RETURN {'namn': CONCAT(p.tilltalsnamn, ' ', p.efternamn), '_key': p._key}"
    )
    people_dict = {str(p["_key"]): p["namn"] for p in people}

    cursor = arango.db.aql.execute(
        """FOR t IN talks RETURN {'_id': t._id, 'anforandetext': t.anforandetext, 'avsnittsrubrik': t.avsnittsrubrik, 'parti': t.parti, 'intressent_id': t.intressent_id}""",
        batch_size=BATCH_SIZE,
        count=True,
    )
    talks = arango.db.collection("talks")

    cleaned_talks = []
    n = 0
    for n, talk in enumerate(cursor, start=1):
        # Attributes missing in the database come back as None; only clean
        # values that actually contain text.
        for field in ("anforandetext", "avsnittsrubrik"):
            if talk.get(field):
                talk[field] = clean_talk(talk[field])

        # people_dict is keyed by str(_key), so the lookup key must be
        # stringified for the membership test as well as for the lookup.
        speaker_key = str(talk.get("intressent_id"))
        if speaker_key in people_dict:
            talk["talare"] = people_dict[speaker_key]

        talk["parti"] = normalize_party(talk["parti"])

        cleaned_talks.append(talk)
        if len(cleaned_talks) >= BATCH_SIZE:
            talks.update_many(cleaned_talks, silent=True)
            print(f"Processed {n} talks", end="\r")
            cleaned_talks = []

    # Write the final, partially filled batch; without this the last
    # (count % BATCH_SIZE) talks would never be updated.
    if cleaned_talks:
        talks.update_many(cleaned_talks, silent=True)
    print(f"Processed {n} talks")


if __name__ == "__main__":
    main()
Loading…
Cancel
Save