# takes json from data.riksdagen.se and put it into db. import json import multiprocessing import os from sqlalchemy.types import Integer import pandas as pd import sqlalchemy from config import db_name from config import db_user as user from config import ip_server as ip from config import pwd_postgres as pwd from info import months_conversion def datestring_to_date(x): date_list = x.split(" ") day = date_list[0] if len(str(day)) == 1: day = "0" + day return f"{date_list[2]}-{months_conversion[date_list[1]]}-{date_list[0]}" def fix_speaker(y): if y == None: y = "" x = y.lower() if "ministern" in x: y = y[x.find("ministern") + 9 :] elif "minister" in x: y = y[x.find("minister") + 8 :] elif "statsrådet" in x: y = y[x.find("statsrådet") + 10 :] elif "talman" in x: y = y[x.find("talman") + 6 :] return y.title().replace(" Replik", "").strip() def get_talks_in_db(): """Return a set of all id:s in db.""" engine = get_engine() return set(pd.read_sql('select _id from talks', engine)['_id'].tolist()) def format_debate(debate): debate_types = { "beslut": "kam-vo", "debatt om beslut": "bet", "frågestund": "kam-fs", "information från regeringen": "kam-ar", "interpellationsdebatt": "ip", "statsministerns frågestund": "kam-sf", "öppen utfrågning": "sam-ou", "aktuell debatt": "kam-ad", "allmänpolitisk debatt": "kam-al", "budgetdebatt": "kam-bu", "bordläggning": "kam-bp", "partiledardebatt": "kam-pd", "debatt med anledning av vårpropositionen": "kam-dv", "öppet seminarium": "sam-se", "utrikespolitisk debatt": "kam-ud", } try: debate = debate_types[debate] except KeyError: pass return debate def fix_text(text): try: text = text.replace("\n", " ").replace(" ", " ").replace('STYLEREF Kantrubrik \* MERGEFORMAT', '') except: pass return text def get_talk(d, doc, f): return doc["dokumentstatus"]['debatt']['anforande'][int(d["anforande_nummer"])-f] def d2df(file): with open(f'talks/{file}', encoding="utf-8-sig") as f: d = json.load(f)["anforande"] d['datum'] = str(d['dok_datum'])[:10] d['year'] = d['datum'][:4] d['_id'] = file.replace('.json', '') del d['dok_datum'] # Open the corresponding document with info on webb-TV if "rel_dok_id" in d: rel_dok = d["rel_dok_id"] if rel_dok not in [None, '']: if ',' in rel_dok: rel_dok = rel_dok.split(',')[0] #! Använd bara den första, går att göra bättre? try: with open(f'documents/{rel_dok}.json'.lower(), encoding="utf-8-sig") as f: doc = json.load(f) except json.decoder.JSONDecodeError: doc = {"dokumentstatus": []} except FileNotFoundError: doc = {"dokumentstatus": []} if 'debatt' in doc["dokumentstatus"]: try: if doc["dokumentstatus"]['debatt'] != None: first_talk_number = int(doc["dokumentstatus"]['debatt']['anforande'][0]["anf_nummer"]) if 'webbmedia' in doc["dokumentstatus"]: d["audiofileurl"] = doc["dokumentstatus"]["webbmedia"]["media"]["audiofileurl"] d["debateurl"] = doc["dokumentstatus"]["webbmedia"]["media"]["debateurl"] try: d["kon"] = get_talk(d, doc, first_talk_number)["kon"] d["debatt_id"] = get_talk(d, doc, first_talk_number)["debatt_id"] d["anf_sekunder"] = get_talk(d, doc, first_talk_number)["anf_sekunder"] d["startpos"] = get_talk(d, doc, first_talk_number)["startpos"] except IndexError: pass except: pass # Fill in missing keys. for i in [ "audiofileurl", "debateurl", 'anf_sekunder', "startpos", "debatt_id", "rel_dok_id" "kon" ]: if i not in d: d[i] = '' # Make and return DF. return pd.DataFrame(d, index = [file]) def get_ids(): try: df = pd.read_sql("select distinct id_session from talks", db.engine) ids = set(df["id_long"].tolist()) except: ids = [] return ids def get_engine(): engine = sqlalchemy.create_engine( f"postgresql://{user}:{pwd}@{ip}:5432/riksdagen" ) return engine def upload2db(df, if_exists='append'): engine = get_engine() df.to_sql( db_name, engine, dtype={ "year": Integer(), "dok_nummer": Integer(), "anforande_nummer": Integer(), # 'anf_sekunder': Integer(), # "startpos": Integer() }, chunksize=5000, method="multi", if_exists=if_exists, index=False ) def df_from_files(files, test=False): if test: files = files[:1000] df_list = [] with multiprocessing.Pool(multiprocessing.cpu_count()-1) as pool: df_list = pool.map(d2df, files) df = pd.concat(df_list) df.fillna('', inplace=True) df.drop_duplicates(inplace=True, ignore_index=True) df["anforandetext"] = df["anforandetext"].apply(lambda x: fix_text(x)) df["anforandetext"].fillna('""', inplace=True) df["anforandetext"] = df["anforandetext"].apply(lambda x: x.strip()) df["text_lower"] = df.anforandetext.apply(lambda x: x.lower()) #df["date_"] = df["date"].apply(lambda x: str(x.to_pydatetime())[:10]) df["talare"] = df["talare"].apply(lambda x: fix_speaker(x)) df["anforande_nummer"] = df["anforande_nummer"].apply(lambda x: int(x)) df['parti'] = df['parti'].apply(lambda x: x.upper().strip()) return df if __name__ == "__main__": files = [] for file in os.listdir("talks"): if ".json" not in file: continue files.append(file) df = df_from_files(files) upload = input('Upload and overwrite db? ') if upload in ['y', 'yes']: upload2db(df, if_exists='overwrite')