from arango_client import arango


def clean_talk(text):
    """Strip Word-export artefacts from a talk text or heading.

    Three clean-ups are applied, in order:

    1. The leaked Word field code "STYLEREF Kantrubrik ... MERGEFORMAT" is
       removed, with or without the backslash before the asterisk.
    2. A hyphen followed by one whitespace character is removed when there
       is non-whitespace on both sides (e.g. "till- sammans" becomes
       "tillsammans").
    3. A newline is replaced by a space when the preceding character is not
       whitespace or one of ``.!?:;`` and the following character is a
       lowercase letter a-z, å, ä or ö.

    Args:
        text: The string to clean. ``None`` and the empty string are
            returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is ``None`` or empty.
    """
    import re

    # Null attributes come back from the database as None; there is nothing
    # to clean, and the substring/regex operations below would raise on it.
    if not text:
        return text

    # Remove the field code both as "STYLEREF Kantrubrik \* MERGEFORMAT"
    # (the form the field has in Word) and without the backslash.
    text = re.sub(r"STYLEREF Kantrubrik \\?\* MERGEFORMAT", "", text)
    # Remove "- " when there is text on both sides, e.g. "till- sammans"
    # (this comes from line breaks in Word).
    # NOTE(review): this also joins suspended hyphens, e.g.
    # "till- och frånkoppling" becomes "tilloch frånkoppling".
    text = re.sub(r"(?<=\S)-\s(?=\S)", "", text)
    # Remove line breaks in the middle of sentences.
    text = re.sub(r"(?<=[^\s.!?:;])\n(?=[a-zåäö])", " ", text)
    return text


# Number of cleaned talks collected before they are written back in one call.
_WRITE_BATCH_SIZE = 100

# Party codes that are rewritten to another value; "" becomes "-".
_PARTY_RENAMES = {"FP": "L", "KDS": "KD", "": "-"}

# Upper-case office/role names that appear in the party field.
_OFFICE_TITLES = frozenset(
    {
        "TALMANNEN",
        "FÖRSTE VICE TALMANNEN",
        "ANDRE VICE TALMANNEN",
        "TREDJE VICE TALMANNEN",
        "ÅLDERSPRESIDENTEN",
        "HANS MAJESTÄT KONUNGEN",
        "TJÄNSTGÖRANDE ÅLDERSPRESIDENTEN",
    }
)


def _normalize_party(parti):
    """Return the normalised value for a talk's ``parti`` field.

    Non-string values (e.g. ``None``) and unrecognised strings are returned
    unchanged.
    """
    if not isinstance(parti, str):
        return parti
    if parti in _PARTY_RENAMES:
        return _PARTY_RENAMES[parti]
    if parti in _OFFICE_TITLES:
        # str.title() capitalises the first letter of EVERY word, e.g.
        # "FÖRSTE VICE TALMANNEN" -> "Förste Vice Talmannen".
        # NOTE(review): an earlier comment described "first letter uppercase
        # and rest lowercase" (str.capitalize()); confirm which is wanted.
        return parti.title()
    return parti


def _flush(batch):
    """Write a batch of cleaned talks back to the ``talks`` collection."""
    if batch:
        arango.db.collection("talks").update_many(batch, silent=True)


def main():
    """Clean every document in ``talks`` and write the result back.

    For each talk the text and heading are passed through ``clean_talk``,
    the speaker name is added as ``talare`` when the talk's
    ``intressent_id`` matches a person's ``_key``, and the party field is
    normalised. Updates are written in batches of ``_WRITE_BATCH_SIZE``.
    """
    people = arango.db.aql.execute(
        "FOR p IN people RETURN {'namn': CONCAT(p.tilltalsnamn, ' ', p.efternamn), '_key': p._key}"
    )
    people_dict = {str(p["_key"]): p["namn"] for p in people}

    cursor = arango.db.aql.execute(
        """FOR t IN talks RETURN {'_id': t._id, 'anforandetext': t.anforandetext, 'avsnittsrubrik': t.avsnittsrubrik, 'parti': t.parti, 'intressent_id': t.intressent_id}""",
        batch_size=100,
        count=True,
    )

    cleaned_talks = []
    n = 0
    for n, talk in enumerate(cursor, start=1):
        # The query always returns these keys, so a missing attribute shows
        # up as None rather than as an absent key; treat it as empty text.
        for field in ("anforandetext", "avsnittsrubrik"):
            talk[field] = clean_talk(talk.get(field) or "")

        # people_dict is keyed by str(_key), so compare the id as a string
        # as well; otherwise non-string ids would never match.
        intressent_id = talk.get("intressent_id")
        if intressent_id is not None:
            speaker = people_dict.get(str(intressent_id))
            if speaker is not None:
                talk["talare"] = speaker

        talk["parti"] = _normalize_party(talk.get("parti"))

        cleaned_talks.append(talk)
        if len(cleaned_talks) >= _WRITE_BATCH_SIZE:
            _flush(cleaned_talks)
            print(f"Processed {n} talks", end="\r")
            cleaned_talks = []

    # Write the final, partially filled batch; without this the last
    # (total % _WRITE_BATCH_SIZE) talks would never be updated.
    _flush(cleaned_talks)
    # Terminate the carriage-return progress line with the final count.
    print(f"Processed {n} talks")


if __name__ == "__main__":
    main()