parent
bd922c498f
commit
54db0e339b
1 changed files with 60 additions and 0 deletions
@ -0,0 +1,60 @@ |
||||
from arango_client import arango |
||||
|
||||
|
||||
def clean_talk(text):
    """Clean up artefacts left in a text by the Word source documents.

    Three clean-ups are applied, in this order:

    1. The leaked Word field code "STYLEREF Kantrubrik * MERGEFORMAT" is
       removed, with or without a backslash in front of the asterisk.
    2. Words split by a line-break hyphen are joined again, e.g.
       "till- sammans" -> "tillsammans". Suspended hyphens followed by
       "och" or "eller" ("in- och utrikes") are left untouched, since
       joining them would glue two separate words together.
    3. A line break in the middle of a sentence (not preceded by
       whitespace or sentence punctuation, and followed by a lowercase
       letter) is replaced with a single space.

    Args:
        text: The text to clean. A falsy value (``None`` or an empty
            string) is returned unchanged.

    Returns:
        The cleaned text.
    """
    import re

    # Nothing to clean; also avoids a TypeError when the field is null.
    if not text:
        return text

    # The field code shows up both as "\* MERGEFORMAT" and "* MERGEFORMAT".
    text = re.sub(r"STYLEREF Kantrubrik \\?\* MERGEFORMAT", "", text)

    # Join words that were hyphenated across a line break in Word. The
    # hyphen must touch the preceding word, so a spaced dash (" - ") is
    # never matched. The negative lookahead keeps suspended hyphens such
    # as "in- och utrikes" intact.
    text = re.sub(r"(?<=\S)-\s(?!(?:och|eller)\b)(?=\S)", "", text)

    # Remove linebreaks in the middle of sentences.
    text = re.sub(r"(?<=[^\s.!?:;])\n(?=[a-zåäö])", " ", text)

    return text
||||
|
||||
|
||||
if __name__ == "__main__":
    # Maintenance script: cleans the text fields of every document in the
    # "talks" collection, adds the speaker's name, normalises the party
    # value, and writes the result back in batches.

    # Number of talks fetched per cursor batch and written per update call.
    BATCH_SIZE = 100

    # Speaker roles stored in upper case in the "parti" field; they are
    # converted to title case below.
    ROLE_TITLES = (
        "TALMANNEN",
        "FÖRSTE VICE TALMANNEN",
        "ANDRE VICE TALMANNEN",
        "TREDJE VICE TALMANNEN",
        "ÅLDERSPRESIDENTEN",
        "HANS MAJESTÄT KONUNGEN",
        "TJÄNSTGÖRANDE ÅLDERSPRESIDENTEN",
    )

    # Lookup table from a person's _key to their full name.
    people = arango.db.aql.execute(
        "FOR p IN people RETURN {'namn': CONCAT(p.tilltalsnamn, ' ', p.efternamn), '_key': p._key}"
    )
    people_dict = {str(p["_key"]): p["namn"] for p in people}

    cursor = arango.db.aql.execute(
        """FOR t IN talks RETURN {'_id': t._id, 'anforandetext': t.anforandetext, 'avsnittsrubrik': t.avsnittsrubrik, 'parti': t.parti, 'intressent_id': t.intressent_id}""",
        batch_size=BATCH_SIZE,
        count=True,
    )

    talks_collection = arango.db.collection("talks")
    cleaned_talks = []
    n = 0
    for n, talk in enumerate(cursor, start=1):
        # The query always returns these keys, with null for missing
        # attributes, so a .get() default would never kick in. Fall back
        # to "" explicitly so clean_talk always receives a string.
        for field in ("anforandetext", "avsnittsrubrik"):
            talk[field] = clean_talk(talk.get(field) or "")

        # people_dict is keyed by str(_key); stringify once so the
        # membership test and the lookup use the same key.
        speaker_key = str(talk.get("intressent_id"))
        if speaker_key in people_dict:
            talk["talare"] = people_dict[speaker_key]

        parti = talk["parti"]
        if parti == "FP":
            parti = "L"
        elif parti == "KDS":
            parti = "KD"
        elif parti in ROLE_TITLES:
            # Capitalise the first letter of every word and lowercase
            # the rest, e.g. "FÖRSTE VICE TALMANNEN" -> "Förste Vice Talmannen".
            parti = parti.title()
        elif parti == "":
            parti = "-"
        talk["parti"] = parti

        cleaned_talks.append(talk)
        if len(cleaned_talks) >= BATCH_SIZE:
            talks_collection.update_many(cleaned_talks, silent=True)
            print(f"Processed {n} talks", end="\r", flush=True)
            cleaned_talks = []

    # Write the final, partially filled batch; without this the last
    # (total % BATCH_SIZE) talks would never be updated.
    if cleaned_talks:
        talks_collection.update_many(cleaned_talks, silent=True)
    print(f"Processed {n} talks")
||||
Loading…
Reference in new issue