You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

149 lines
5.3 KiB

import os
import sys
import json
import re
import logging
# Run from the project root so that relative paths used below (e.g. the
# "talks" directory) resolve correctly.
os.chdir("/home/lasse/riksdagen")
# Add the project root to the Python path so the local arango_client module
# can be imported; this must happen before the import below.
sys.path.append("/home/lasse/riksdagen")
from arango_client import arango
# Log warnings and above, prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s")
def clean_text(text: str) -> str:
    """Convert an HTML fragment to plain text with normalised line breaks.

    ``<br>`` variants and closing ``</p>`` tags become newlines, every
    remaining tag is dropped, runs of newlines are collapsed to a single
    newline, and leading/trailing whitespace is trimmed.

    Args:
        text: Raw text, possibly containing HTML markup. May be None.

    Returns:
        The cleaned text, or an empty string when ``text`` is None.
    """
    if text is None:
        return ""
    # Turn line-breaking markup into real newlines before tags are dropped.
    for line_break in ("<br>", "<br/>", "<br />", "</p>"):
        text = text.replace(line_break, "\n")
    # Remove the remaining tags *before* normalising whitespace: dropping a
    # tag can leave adjacent newlines or edge whitespace behind, which would
    # otherwise survive in the result.
    text = re.sub(r"<.*?>", "", text)
    # Replace multiple newlines with a single newline.
    text = re.sub(r"\n+", "\n", text)
    return text.strip()
def clean_speaker_name(name):
    """Return the speaker name without a trailing parenthesised suffix.

    A parenthesised group at the very end of the name (for example a party
    abbreviation such as ``"(S)"``) is removed together with the whitespace
    around it. ``None`` yields an empty string.
    """
    if name is None:
        return ""
    # Drop a parenthesised group, and its content, only when it ends the name.
    without_suffix = re.sub(r"\s*\(.*?\)\s*$", "", name.strip())
    return without_suffix.strip()
def insert_documents_to_arango(docs: list[dict]):
    """Insert documents into the ``talks`` collection in batches of 100.

    Existing documents with the same ``_key`` are overwritten.

    Args:
        docs: Documents to insert. An empty list is a no-op.
    """
    if not docs:
        return
    # The module-level ``arango_collection`` is only assigned when this file
    # runs as a script. Fall back to looking the collection up so the
    # function also works when the module is imported.
    collection = globals().get("arango_collection")
    if collection is None:
        collection = arango.db.collection("talks")
    batch_size = 100
    for start in range(0, len(docs), batch_size):
        batch = docs[start : start + batch_size]
        collection.insert_many(batch, overwrite=True)
def process_folder(folder_path: str, already_processed: set[str] = None) -> list[dict]:
    """Read every ``.json`` file in a folder and build documents ready for insertion.

    Each file is expected to hold a JSON object with an ``"anforande"`` entry.
    That entry is normalised in place: fields are renamed, the text and
    speaker name are cleaned, and ``anforande_id`` becomes the ``_key``.

    Files that are not valid JSON, lack a required key, or contain a field
    that cannot be converted are logged as warnings and skipped, so a single
    bad file does not abort the whole folder.

    Args:
        folder_path: Directory containing the JSON files.
        already_processed: ``anforande_id`` values to skip. Defaults to
            nothing being skipped.

    Returns:
        The normalised documents, one per file that was processed.
    """
    if already_processed is None:
        already_processed = set()
    docs = []
    for file in os.listdir(folder_path):
        if not file.endswith(".json"):
            continue
        try:
            # utf-8-sig transparently drops a leading byte-order mark.
            with open(os.path.join(folder_path, file), "r", encoding="utf-8-sig") as f:
                data = json.load(f)
            doc = data["anforande"]
            if doc["anforande_id"] in already_processed:
                continue
            # "dok_rm" looks like "2021/22"; keep the first four characters as the period.
            doc["period"] = int(doc.get("dok_rm", "0000")[:4])
            doc.pop("dok_rm", None)
            doc["anforandetext"] = clean_text(doc.get("anforandetext", ""))
            doc["talare"] = clean_speaker_name(doc.get("talare", ""))
            doc["_key"] = doc.pop("anforande_id")
            # Keep only the date part of "YYYY-MM-DD hh:mm:ss".
            doc["datum"] = doc.get("dok_datum", "").split(" ")[0]
            doc["titel"] = doc.pop("dok_titel", "")
            doc["anforande_nummer"] = int(doc.get("anforande_nummer"))
            doc["hangar_id"] = doc.pop("dok_hangar_id", "")
            doc["id"] = doc.pop("dok_id", "")
            doc.pop("underrubrik", None)
            doc["replik"] = doc.get("replik", "N") == "Y"
            doc.pop("systemdatum", None)
            docs.append(doc)
        except json.JSONDecodeError:
            # Must come before ValueError, which JSONDecodeError subclasses.
            logging.warning(f"Skipping invalid JSON file: {file}")
        except KeyError as e:
            logging.warning(f"Missing expected key {e} in file: {file}")
        except (AttributeError, TypeError, ValueError) as e:
            # A missing, null or non-numeric field; skip only this file.
            logging.warning(f"Skipping file with malformed data ({e}): {file}")
    return docs
def update_folder(path: str, already_processed: set[str] = None) -> int:
    """Import the JSON files of one folder into the ArangoDB ``talks`` collection.

    Args:
        path (str): The path to the folder containing JSON files.
        already_processed (set[str], optional): Keys of documents to skip.
            When None, the keys currently stored in ``talks`` are fetched
            from the database. Defaults to None.

    Returns:
        int: The number of documents from the folder that were passed on
        for insertion.
    """
    if already_processed is None:
        existing_keys = arango.db.aql.execute(
            """
            FOR doc IN talks
            RETURN doc._key
            """
        )
        already_processed = set(existing_keys)
    new_docs = process_folder(path, already_processed)
    insert_documents_to_arango(new_docs)
    return len(new_docs)
# Script entry point: import every folder found in the talks directory.
if __name__ == "__main__":
    # Fetch the stored keys once and share them across all folders.
    key_cursor = arango.db.aql.execute(
        """
        FOR doc IN talks
        RETURN doc._key
        """
    )
    already_processed = set(key_cursor)
    # Module-level collection handle read by insert_documents_to_arango().
    arango_collection = arango.db.collection("talks")
    for folder in os.listdir("talks"):
        path = f"/home/lasse/riksdagen/talks/{folder}"
        if not os.path.isdir(path):
            continue
        print(f"Processing folder: {folder}")
        num_docs = update_folder(path, already_processed)
        print(f"Inserted/Updated {num_docs} documents from folder {folder}")
# Raw input documents (the value of the "anforande" key in each JSON file) look like this:
# {'anforande_id': '7f96d4d6-ccd6-ec11-9170-0090facf175a',
# 'anforande_nummer': '112',
# 'anforandetext': '<p>Fru talman! Tack, Camilla Waltersson Grönvall, för ditt '
# 'tycker inte att jag fick något tillräckligt bra svar på '
# 'det, fru talman.</p>',
# 'avsnittsrubrik': 'Stöd till personer med funktionsnedsättning',
# 'dok_datum': '2022-03-16 00:00:00',
# 'dok_hangar_id': '5124227',
# 'dok_id': 'H90982',
# 'dok_nummer': '82',
# 'dok_rm': '2021/22',
# 'dok_titel': 'Protokoll 2021/22:82 Onsdagen den 16 mars',
# 'intressent_id': '0554852839719',
# 'kammaraktivitet': 'ärendedebatt',
# 'parti': 'S',
# 'rel_dok_id': 'H901SoU12',
# 'replik': 'Y',
# 'systemdatum': '2022-05-18 19:06:57',
# 'talare': 'Mikael Dahlqvist (S)',
# 'underrubrik': ''}