"""Load Riksdagen speech JSON files into the ArangoDB ``talks`` collection.

Each sub-folder of ``<project root>/talks`` is scanned for ``*.json`` files,
every speech is normalized (renamed fields, cleaned text, typed values) and
the resulting documents are written to ArangoDB in batches.
"""

import json
import logging
import os
import re
import sys
from typing import Optional

PROJECT_ROOT = "/home/lasse/riksdagen"
TALKS_DIR = os.path.join(PROJECT_ROOT, "talks")
TALKS_COLLECTION = "talks"
INSERT_BATCH_SIZE = 100

# Set /home/lasse/riksdagen as working directory
os.chdir(PROJECT_ROOT)
# Add the project root to Python path to locate local modules
sys.path.append(PROJECT_ROOT)

from arango_client import arango  # noqa: E402  (needs sys.path set up above)

logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")

# NOTE(review): in the reviewed copy of this file the four literals that were
# passed to ``str.replace`` in ``clean_text`` were not legible (they rendered
# as raw line breaks). They are reconstructed here as the HTML line-break and
# paragraph tags that appear in the speech text, plus raw carriage returns and
# Unicode line separators, written as explicit escapes so they stay visible.
# Confirm against the original file.
_LINE_BREAK_RE = re.compile(
    r"<br\s*/?>|</?p(?:\s[^>]*)?>|\r\n?|[\x85\u2028\u2029]",
    re.IGNORECASE,
)
_HTML_TAG_RE = re.compile(r"<.*?>")
_MULTI_NEWLINE_RE = re.compile(r"\n+")
_TRAILING_PARENS_RE = re.compile(r"\s*\(.*?\)\s*$")


def clean_text(text: str) -> str:
    """Return ``text`` with markup removed and line breaks normalized.

    Line-break markup and raw line separators become ``"\\n"``, any remaining
    HTML tags are dropped, runs of newlines collapse to a single newline and
    surrounding whitespace is stripped.

    Args:
        text: Raw speech text; ``None`` is treated as empty.

    Returns:
        The cleaned text, or ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    text = _LINE_BREAK_RE.sub("\n", text)
    # Remove HTML tags if any. Done before collapsing/stripping so that
    # whitespace exposed by a removed tag is cleaned up as well.
    text = _HTML_TAG_RE.sub("", text)
    # Replace multiple newlines with a single newline
    text = _MULTI_NEWLINE_RE.sub("\n", text)
    return text.strip()


def clean_speaker_name(name: Optional[str]) -> str:
    """Return the speaker name without a trailing parenthesized suffix.

    Example: ``"Mikael Dahlqvist (S)"`` becomes ``"Mikael Dahlqvist"``.

    Args:
        name: Raw speaker name; ``None`` is treated as empty.

    Returns:
        The trimmed name, or ``""`` when ``name`` is ``None``.
    """
    if name is None:
        return ""
    # Remove parentheses at the end, together with the content inside them
    name = _TRAILING_PARENS_RE.sub("", name.strip())
    return name.strip()


def insert_documents_to_arango(docs: list[dict], collection=None) -> None:
    """Insert ``docs`` into ArangoDB in batches, overwriting existing keys.

    Args:
        docs: Documents to write; each is expected to carry a ``_key``.
        collection: Collection object exposing ``insert_many``. When omitted
            the ``talks`` collection is looked up on ``arango.db``, so the
            function also works when this module is imported rather than run
            as a script.
    """
    if collection is None:
        collection = arango.db.collection(TALKS_COLLECTION)
    for start in range(0, len(docs), INSERT_BATCH_SIZE):
        batch = docs[start : start + INSERT_BATCH_SIZE]
        results = collection.insert_many(batch, overwrite=True)
        # NOTE(review): assumes the python-arango behaviour of returning
        # per-document errors in the result list instead of raising them.
        if isinstance(results, list):
            for result in results:
                if isinstance(result, Exception):
                    logging.warning(f"Failed to insert document: {result}")


def _normalize_speech(doc: dict) -> dict:
    """Rename, clean and type the fields of one raw speech dict in place."""
    doc["period"] = int(doc.get("dok_rm", "0000")[:4])
    doc.pop("dok_rm", None)
    doc["anforandetext"] = clean_text(doc.get("anforandetext", ""))
    doc["talare"] = clean_speaker_name(doc.get("talare", ""))
    doc["_key"] = doc.get("anforande_id", "")
    doc.pop("anforande_id", None)
    # Keep only the date part of "YYYY-MM-DD HH:MM:SS"
    doc["datum"] = doc.get("dok_datum", "").split(" ")[0]
    doc["titel"] = doc.get("dok_titel", "")
    doc.pop("dok_titel", None)
    doc["anforande_nummer"] = int(doc.get("anforande_nummer"))
    doc["hangar_id"] = doc.get("dok_hangar_id", "")
    doc.pop("dok_hangar_id", None)
    doc["id"] = doc.get("dok_id", "")
    doc.pop("dok_id", None)
    doc.pop("underrubrik", None)
    doc["replik"] = doc.get("replik", "N") == "Y"
    doc.pop("systemdatum", None)
    return doc


def process_folder(folder_path: str, already_processed: Optional[set[str]] = None) -> list[dict]:
    """Read and normalize every ``*.json`` speech file in ``folder_path``.

    Files that are not valid JSON, lack an expected key, or hold values that
    cannot be converted are skipped with a warning instead of aborting the
    whole folder.

    Args:
        folder_path: Directory containing one JSON file per speech.
        already_processed: Speech ids to skip. Defaults to an empty set.

    Returns:
        The normalized documents, ready for insertion.
    """
    if already_processed is None:
        already_processed = set()
    docs = []
    for file in os.listdir(folder_path):
        if not file.endswith(".json"):
            continue
        try:
            # utf-8-sig transparently drops a leading byte-order mark
            with open(os.path.join(folder_path, file), "r", encoding="utf-8-sig") as f:
                data = json.load(f)
            doc = data["anforande"]
            if doc["anforande_id"] in already_processed:
                continue
            docs.append(_normalize_speech(doc))
        except json.JSONDecodeError:
            logging.warning(f"Skipping invalid JSON file: {file}")
        except KeyError as e:
            logging.warning(f"Missing expected key {e} in file: {file}")
        except (AttributeError, TypeError, ValueError) as e:
            # e.g. a null or non-numeric "anforande_nummer" / "dok_rm"
            logging.warning(f"Skipping file with unexpected data ({e}): {file}")
    return docs


def _fetch_existing_keys() -> set[str]:
    """Return the ``_key`` of every document currently in ``talks``."""
    cursor = arango.db.aql.execute(
        """
        FOR doc IN talks
            RETURN doc._key
        """
    )
    return set(cursor)


def update_folder(path: str, already_processed: Optional[set[str]] = None) -> int:
    """Updates the ArangoDB collection with documents from the specified folder.

    Args:
        path (str): The path to the folder containing JSON files.
        already_processed (set[str], optional): A set of already processed
            document keys to avoid duplicates. If None, it will fetch existing
            keys from the database. Defaults to None.

    Returns:
        int: The number of documents read from the folder and sent for insertion.
    """
    if already_processed is None:
        already_processed = _fetch_existing_keys()
    folder_docs = process_folder(path, already_processed)
    insert_documents_to_arango(folder_docs)
    return len(folder_docs)


def main() -> None:
    """Update all folders in the talks directory."""
    # Fetch the existing keys once and reuse them for every folder
    already_processed = _fetch_existing_keys()
    for folder in os.listdir(TALKS_DIR):
        path = os.path.join(TALKS_DIR, folder)
        if os.path.isdir(path):
            print(f"Processing folder: {folder}")
            num_docs = update_folder(path, already_processed)
            print(f"Inserted/Updated {num_docs} documents from folder {folder}")


if __name__ == "__main__":
    main()


# Documents look like this:
# NOTE(review): the paragraph markup around 'anforandetext' below is
# reconstructed; verify against a real input file.
# {'anforande_id': '7f96d4d6-ccd6-ec11-9170-0090facf175a',
#  'anforande_nummer': '112',
#  'anforandetext': '<p>Fru talman! Tack, Camilla Waltersson Grönvall, för ditt '
#                   'tycker inte att jag fick något tillräckligt bra svar på '
#                   'det, fru talman.</p>',
#  'avsnittsrubrik': 'Stöd till personer med funktionsnedsättning',
#  'dok_datum': '2022-03-16 00:00:00',
#  'dok_hangar_id': '5124227',
#  'dok_id': 'H90982',
#  'dok_nummer': '82',
#  'dok_rm': '2021/22',
#  'dok_titel': 'Protokoll 2021/22:82 Onsdagen den 16 mars',
#  'intressent_id': '0554852839719',
#  'kammaraktivitet': 'ärendedebatt',
#  'parti': 'S',
#  'rel_dok_id': 'H901SoU12',
#  'replik': 'Y',
#  'systemdatum': '2022-05-18 19:06:57',
#  'talare': 'Mikael Dahlqvist (S)',
#  'underrubrik': ''}