You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
149 lines
5.3 KiB
149 lines
5.3 KiB
import os |
|
import sys |
|
import json |
|
import re |
|
import logging |
|
|
|
# Run from the project checkout so relative paths (such as the "talks"
# directory listed by the script entry point) resolve against it.
os.chdir("/home/lasse/riksdagen")

# The project root must be on sys.path before the local module below can be
# imported.
sys.path.append("/home/lasse/riksdagen")
from arango_client import arango

# Only warnings and above are reported, prefixed with timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.WARNING,
)
|
|
|
|
|
|
|
def clean_text(text: str) -> str:
    """Convert an HTML fragment into plain text.

    ``<br>`` variants and closing ``</p>`` tags become line breaks, every
    remaining tag is dropped, runs of newlines are collapsed into a single
    newline and surrounding whitespace is trimmed.

    Args:
        text: Raw text, possibly containing HTML tags. ``None`` is accepted.

    Returns:
        The cleaned text, or ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    for line_break in ("<br>", "<br/>", "<br />", "</p>"):
        text = text.replace(line_break, "\n")
    # Strip the remaining tags *before* normalising whitespace: removing a tag
    # can expose blank lines or leading/trailing spaces that would otherwise
    # survive the clean-up.
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\n+", "\n", text)
    return text.strip()
|
|
|
|
|
def clean_speaker_name(name: str) -> str:
    """Return a speaker name without a trailing parenthesised suffix.

    Example: ``"Mikael Dahlqvist (S)"`` becomes ``"Mikael Dahlqvist"``.

    Args:
        name: Speaker name as found in the source data. ``None`` is accepted.

    Returns:
        The trimmed name with one parenthesised group removed if it sits at
        the very end, or ``""`` when ``name`` is ``None``.
    """
    if name is None:
        return ""
    # ``[^()]*`` keeps the match inside the *last* pair of parentheses; a
    # lazy ``.*?`` anchored with ``$`` would start at the first "(" in the
    # string and swallow everything after it.
    return re.sub(r"\s*\([^()]*\)\s*$", "", name.strip()).strip()
|
|
|
|
|
|
|
|
|
def insert_documents_to_arango(docs: list[dict], batch_size: int = 100) -> None:
    """Insert documents into the ``talks`` collection in batches.

    Existing documents with the same key are overwritten.

    Args:
        docs: Documents to store.
        batch_size: Maximum number of documents sent per ``insert_many`` call.
    """
    if not docs:
        return
    # Look the collection up here rather than reading a module global that is
    # only assigned when this file is executed as a script; otherwise calling
    # this function after importing the module raises NameError.
    collection = arango.db.collection("talks")
    for start in range(0, len(docs), batch_size):
        collection.insert_many(docs[start : start + batch_size], overwrite=True)
|
|
|
|
|
def _normalize_speech(doc: dict) -> dict:
    """Reshape one raw ``anforande`` record in place into the stored layout."""
    # The first four characters of the session label, e.g. "2021/22" -> 2021.
    doc["period"] = int(doc.get("dok_rm", "0000")[:4])
    doc.pop("dok_rm", None)
    doc["anforandetext"] = clean_text(doc.get("anforandetext", ""))
    doc["talare"] = clean_speaker_name(doc.get("talare", ""))
    doc["_key"] = doc.pop("anforande_id", "")
    # Keep only the part before the first space, e.g. the date of
    # "2022-03-16 00:00:00".
    doc["datum"] = doc.get("dok_datum", "").split(" ")[0]
    doc["titel"] = doc.pop("dok_titel", "")
    doc["anforande_nummer"] = int(doc.get("anforande_nummer"))
    doc["hangar_id"] = doc.pop("dok_hangar_id", "")
    doc["id"] = doc.pop("dok_id", "")
    doc.pop("underrubrik", None)
    doc["replik"] = doc.get("replik", "N") == "Y"
    doc.pop("systemdatum", None)
    return doc


def process_folder(folder_path: str, already_processed: set[str] = None) -> list[dict]:
    """Read every ``.json`` file in a folder and return the normalised speeches.

    Each file is expected to hold an object with an ``"anforande"`` entry.
    Files that cannot be parsed, lack an expected key or contain values of an
    unexpected type are logged as warnings and skipped, so a single bad file
    does not abort the whole folder.

    Args:
        folder_path: Directory to scan (not recursive).
        already_processed: ``anforande_id`` values to skip. Defaults to
            nothing being skipped.

    Returns:
        The normalised documents, one per accepted file.
    """
    if already_processed is None:
        already_processed = set()

    docs = []
    for file in os.listdir(folder_path):
        if not file.endswith(".json"):
            continue
        try:
            # "utf-8-sig" transparently drops a leading BOM if present.
            with open(os.path.join(folder_path, file), "r", encoding="utf-8-sig") as f:
                data = json.load(f)
            doc = data["anforande"]
            if doc["anforande_id"] in already_processed:
                continue
            _normalize_speech(doc)
        except json.JSONDecodeError:
            logging.warning(f"Skipping invalid JSON file: {file}")
        except KeyError as e:
            logging.warning(f"Missing expected key {e} in file: {file}")
        except (TypeError, ValueError, AttributeError) as e:
            # E.g. "anforande" is null, or a numeric field is absent or
            # non-numeric.
            logging.warning(f"Skipping file with malformed data ({e}): {file}")
        else:
            docs.append(doc)
    return docs
|
|
|
|
|
def update_folder(path: str, already_processed: set[str] = None) -> int:
    """Load the speeches found in one folder and store them in ArangoDB.

    Args:
        path (str): Folder containing the JSON files.
        already_processed (set[str], optional): Document keys to skip.
            When None, the keys currently present in the ``talks`` collection
            are queried from the database and used instead. Defaults to None.

    Returns:
        int: Number of documents produced from the folder and passed on for
            insertion.
    """
    if already_processed is None:
        existing_keys = arango.db.aql.execute(
            """
            FOR doc IN talks
                RETURN doc._key
            """
        )
        already_processed = set(existing_keys)

    new_docs = process_folder(path, already_processed)
    insert_documents_to_arango(new_docs)
    return len(new_docs)
|
|
|
# Update all folders in talks directory |
|
if __name__ == "__main__":
    # Fetch the stored keys once and reuse the same set for every folder,
    # instead of letting update_folder query the database per call.
    key_cursor = arango.db.aql.execute(
        """
        FOR doc IN talks
            RETURN doc._key
        """
    )
    already_processed = set(key_cursor)

    # Module-level handle read by insert_documents_to_arango.
    arango_collection = arango.db.collection("talks")

    for folder in os.listdir("talks"):
        path = f"/home/lasse/riksdagen/talks/{folder}"
        if not os.path.isdir(path):
            continue
        print(f"Processing folder: {folder}")
        num_docs = update_folder(path, already_processed)
        print(f"Inserted/Updated {num_docs} documents from folder {folder}")
|
|
|
|
|
# Raw input documents (the value under the "anforande" key of each JSON file) look like this:
|
|
|
# {'anforande_id': '7f96d4d6-ccd6-ec11-9170-0090facf175a', |
|
# 'anforande_nummer': '112', |
|
# 'anforandetext': '<p>Fru talman! Tack, Camilla Waltersson Grönvall, för ditt ' |
|
# 'tycker inte att jag fick något tillräckligt bra svar på ' |
|
# 'det, fru talman.</p>', |
|
# 'avsnittsrubrik': 'Stöd till personer med funktionsnedsättning', |
|
# 'dok_datum': '2022-03-16 00:00:00', |
|
# 'dok_hangar_id': '5124227', |
|
# 'dok_id': 'H90982', |
|
# 'dok_nummer': '82', |
|
# 'dok_rm': '2021/22', |
|
# 'dok_titel': 'Protokoll 2021/22:82 Onsdagen den 16 mars', |
|
# 'intressent_id': '0554852839719', |
|
# 'kammaraktivitet': 'ärendedebatt', |
|
# 'parti': 'S', |
|
# 'rel_dok_id': 'H901SoU12', |
|
# 'replik': 'Y', |
|
# 'systemdatum': '2022-05-18 19:06:57', |
|
# 'talare': 'Mikael Dahlqvist (S)', |
|
# 'underrubrik': ''}
|
|
|