parent
bd922c498f
commit
54db0e339b
1 changed files with 60 additions and 0 deletions
@ -0,0 +1,60 @@ |
|||||||
|
from arango_client import arango |
||||||
|
|
||||||
|
|
||||||
|
def clean_talk(text):
    r"""Remove document-conversion artifacts from a talk text or heading.

    The following clean-ups are applied, in this order:

    1. The leftover Word field code ``STYLEREF Kantrubrik \* MERGEFORMAT``
       is removed, with or without the backslash before the asterisk.
    2. Hyphenation left behind by line breaks in Word is undone, e.g.
       ``"till- sammans"`` becomes ``"tillsammans"``. Suspended hyphens in
       coordinations such as ``"för- och nackdelar"`` are left alone,
       because there the hyphen is part of the text.
    3. A line break in the middle of a sentence (not preceded by
       whitespace or ``. ! ? : ;`` and followed by a lowercase letter) is
       replaced by a single space.

    Args:
        text: The string to clean. ``None`` or an empty string is
            returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when there is nothing to
        clean.
    """
    import re

    # Missing attributes come back from the database as None; there is
    # nothing to clean, and the regex calls below would raise TypeError.
    if not text:
        return text

    # Accept the field code both with and without the backslash.
    text = re.sub(r"STYLEREF Kantrubrik \\?\* MERGEFORMAT", "", text)

    # Join words split by a hyphen + whitespace, unless the next word is a
    # conjunction, which marks a suspended hyphen ("in- och utrikes").
    text = re.sub(r"(?<=\S)-\s(?=\S)(?!(?:och|eller|samt)\b)", "", text)

    # Remove line breaks in the middle of sentences.
    text = re.sub(r"(?<=[^\s.!?:;])\n(?=[a-zåäö])", " ", text)

    return text
||||||
|
|
||||||
|
|
||||||
|
# Number of talks written to the database per update_many call.
_BATCH_SIZE = 100

# Older party codes mapped to the code the script replaces them with.
_PARTY_ALIASES = {"FP": "L", "KDS": "KD"}

# Values of "parti" that are speaker titles rather than party codes.
_SPEAKER_TITLES = frozenset(
    {
        "TALMANNEN",
        "FÖRSTE VICE TALMANNEN",
        "ANDRE VICE TALMANNEN",
        "TREDJE VICE TALMANNEN",
        "ÅLDERSPRESIDENTEN",
        "HANS MAJESTÄT KONUNGEN",
        "TJÄNSTGÖRANDE ÅLDERSPRESIDENTEN",
    }
)


def _normalize_party(parti):
    """Return the normalized form of a talk's ``parti`` value."""
    parti = _PARTY_ALIASES.get(parti, parti)
    if parti in _SPEAKER_TITLES:
        # str.title() capitalizes the first letter of EVERY word, e.g.
        # "FÖRSTE VICE TALMANNEN" -> "Förste Vice Talmannen".
        # NOTE(review): the original comment asked for only the first
        # letter in uppercase (str.capitalize()); confirm which is wanted.
        return parti.title()
    if parti == "":
        return "-"
    return parti


if __name__ == "__main__":
    # Map each person's key (as a string) to their full name.
    people = arango.db.aql.execute(
        "FOR p IN people RETURN {'namn': CONCAT(p.tilltalsnamn, ' ', p.efternamn), '_key': p._key}"
    )
    people_dict = {str(p["_key"]): p["namn"] for p in people}

    cursor = arango.db.aql.execute(
        """FOR t IN talks RETURN {'_id': t._id, 'anforandetext': t.anforandetext, 'avsnittsrubrik': t.avsnittsrubrik, 'parti': t.parti, 'intressent_id': t.intressent_id}""",
        batch_size=100,
        count=True,
    )
    talks_collection = arango.db.collection("talks")

    cleaned_talks = []
    n = 0
    for n, talk in enumerate(cursor, start=1):
        # The query returns null (None) for missing attributes, so the key
        # is always present; "or" substitutes the empty string in that case.
        talk["anforandetext"] = clean_talk(talk.get("anforandetext") or "")
        talk["avsnittsrubrik"] = clean_talk(talk.get("avsnittsrubrik") or "")

        # people_dict is keyed by str(_key); convert once and use the same
        # key for both the membership test and the lookup.
        speaker_key = str(talk.get("intressent_id"))
        if speaker_key in people_dict:
            talk["talare"] = people_dict[speaker_key]

        talk["parti"] = _normalize_party(talk["parti"])

        cleaned_talks.append(talk)
        if len(cleaned_talks) >= _BATCH_SIZE:
            talks_collection.update_many(cleaned_talks, silent=True)
            print(f"Processed {n} talks", end="\r")
            cleaned_talks = []

    # Write the final, partially filled batch; without this the last
    # (fewer than _BATCH_SIZE) talks would never be updated.
    if cleaned_talks:
        talks_collection.update_many(cleaned_talks, silent=True)
    print(f"Processed {n} talks")
||||||
Loading…
Reference in new issue