#!/usr/bin/env python3
# NOTE: removed repository web-UI residue (topic banner / line-count header)
# that was accidentally captured with the file contents.
# takes json from data.riksdagen.se and put it into db.

import json
import multiprocessing
import os

import pandas as pd
import sqlalchemy
from sqlalchemy.types import Integer

from config import db_name
from config import db_user as user
from config import ip_server as ip
from config import pwd_postgres as pwd
from info import months_conversion
|
|
|
|
|
def datestring_to_date(x):
    """Convert a Swedish date string like "5 januari 2020" to ISO "2020-01-05".

    Relies on ``months_conversion`` (from info.py) mapping the Swedish month
    name to a two-digit month number — TODO confirm its value format.
    """
    date_list = x.split(" ")
    day = date_list[0]
    # Zero-pad single-digit days so the result is a valid ISO-8601 date.
    if len(str(day)) == 1:
        day = "0" + day

    # BUG FIX: the original interpolated the unpadded date_list[0] here,
    # which made the zero-padding above dead code.
    return f"{date_list[2]}-{months_conversion[date_list[1]]}-{day}"
|
|
|
|
|
def fix_speaker(y):
    """Strip a leading Swedish title from a speaker name.

    Removes everything up to and including the first occurrence of
    "ministern"/"minister"/"statsrådet"/"talman", title-cases the rest,
    drops a trailing " Replik" marker and strips whitespace.
    Returns "" for None input.
    """
    if y is None:  # was `y == None`; identity check is the correct idiom
        y = ""
    x = y.lower()
    # Order matters: "ministern" must be tried before its shorter stem
    # "minister". The original hard-coded the offsets 9/8/10/6, which are
    # exactly len(title) for each keyword.
    for title in ("ministern", "minister", "statsrådet", "talman"):
        pos = x.find(title)
        if pos != -1:
            y = y[pos + len(title):]
            break
    return y.title().replace(" Replik", "").strip()
|
|
|
def get_talks_in_db():
    """Return a set of all id:s in db."""
    # One round-trip: read the _id column and deduplicate it in Python.
    talks = pd.read_sql('select _id from talks', get_engine())
    return set(talks['_id'])
|
|
|
def format_debate(debate):
    """Map a human-readable debate type to its short riksdagen code.

    Unknown debate types are returned unchanged.
    """
    debate_codes = {
        "beslut": "kam-vo",
        "debatt om beslut": "bet",
        "frågestund": "kam-fs",
        "information från regeringen": "kam-ar",
        "interpellationsdebatt": "ip",
        "statsministerns frågestund": "kam-sf",
        "öppen utfrågning": "sam-ou",
        "aktuell debatt": "kam-ad",
        "allmänpolitisk debatt": "kam-al",
        "budgetdebatt": "kam-bu",
        "bordläggning": "kam-bp",
        "partiledardebatt": "kam-pd",
        "debatt med anledning av vårpropositionen": "kam-dv",
        "öppet seminarium": "sam-se",
        "utrikespolitisk debatt": "kam-ud",
    }
    # dict.get with the input as default replaces the try/except KeyError.
    return debate_codes.get(debate, debate)
|
|
|
|
|
def fix_text(text):
    """Normalise a speech text: flatten newlines and strip the Word field
    artifact 'STYLEREF Kantrubrik \\* MERGEFORMAT'.

    Non-string input (None, NaN floats from pandas) is returned unchanged.
    """
    try:
        # NOTE(review): the middle replace(" ", " ") is a no-op as written;
        # it probably was meant to collapse double spaces — confirm intent.
        # The pattern is now a raw string (same value, no invalid '\*'
        # escape-sequence warning).
        text = text.replace("\n", " ").replace(" ", " ").replace(r'STYLEREF Kantrubrik \* MERGEFORMAT', '')
    except AttributeError:
        # Narrowed from a bare `except:`; only non-str inputs lack .replace.
        pass
    return text
|
|
|
def get_talk(d, doc, f):
    """Return the anforande entry for talk *d* from document *doc*.

    *f* is the anf_nummer of the first talk in the debate, so the list is
    indexed relative to it.
    """
    talks = doc["dokumentstatus"]["debatt"]["anforande"]
    offset = int(d["anforande_nummer"]) - f
    return talks[offset]
|
|
|
|
|
def d2df(file):
    """Read one talk JSON file from talks/ and return it as a one-row DataFrame.

    Adds datum/year/_id fields derived from the document, and — when a
    related document exists under documents/ — webb-TV media metadata
    (audiofileurl, debateurl, kon, debatt_id, anf_sekunder, startpos).
    Keys that could not be resolved are backfilled with ''.
    """
    with open(f'talks/{file}', encoding="utf-8-sig") as f:
        d = json.load(f)["anforande"]

    d['datum'] = str(d['dok_datum'])[:10]
    d['year'] = d['datum'][:4]
    d['_id'] = file.replace('.json', '')
    del d['dok_datum']

    # Open the corresponding document with info on webb-TV.
    if "rel_dok_id" in d:
        rel_dok = d["rel_dok_id"]
        if rel_dok not in [None, '']:
            if ',' in rel_dok:
                rel_dok = rel_dok.split(',')[0]  #! Använd bara den första, går att göra bättre?

            try:
                with open(f'documents/{rel_dok}.json'.lower(), encoding="utf-8-sig") as f:
                    doc = json.load(f)
            except (json.decoder.JSONDecodeError, FileNotFoundError):
                # Missing or corrupt document: fall back to an empty shell.
                doc = {"dokumentstatus": []}

            if 'debatt' in doc["dokumentstatus"]:
                try:
                    if doc["dokumentstatus"]['debatt'] is not None:
                        first_talk_number = int(doc["dokumentstatus"]['debatt']['anforande'][0]["anf_nummer"])
                        if 'webbmedia' in doc["dokumentstatus"]:
                            media = doc["dokumentstatus"]["webbmedia"]["media"]
                            d["audiofileurl"] = media["audiofileurl"]
                            d["debateurl"] = media["debateurl"]
                        try:
                            # Hoisted: the original called get_talk() four
                            # times with identical arguments.
                            talk = get_talk(d, doc, first_talk_number)
                            d["kon"] = talk["kon"]
                            d["debatt_id"] = talk["debatt_id"]
                            d["anf_sekunder"] = talk["anf_sekunder"]
                            d["startpos"] = talk["startpos"]
                        except IndexError:
                            pass
                except Exception:
                    # Was a bare `except:`. Kept deliberately broad — the
                    # document JSON is irregular — but no longer swallows
                    # KeyboardInterrupt/SystemExit.
                    pass

    # Fill in missing keys.
    for key in [
        "audiofileurl",
        "debateurl",
        'anf_sekunder',
        "startpos",
        "debatt_id",
        "rel_dok_id",  # BUG FIX: a missing comma here fused this literal
        "kon",         # with "kon" into the bogus key "rel_dok_idkon".
    ]:
        if key not in d:
            d[key] = ''

    # Make and return DF.
    return pd.DataFrame(d, index=[file])
|
|
|
def get_ids():
    """Return the set of distinct id_session values in the talks table.

    Falls back to an empty set when the table or database is unavailable.
    """
    try:
        # NOTE(review): the original used `db.engine`, but no `db` is
        # defined in this module; get_engine() matches get_talks_in_db().
        df = pd.read_sql("select distinct id_session from talks", get_engine())
        # BUG FIX: the query selects id_session but the original read the
        # non-existent column "id_long", so the KeyError was swallowed and
        # the function always returned [].
        ids = set(df["id_session"])
    except Exception:
        # Was a bare `except:` returning a *list*; return an empty set so
        # the type is consistent with the success path.
        ids = set()
    return ids
|
|
|
def get_engine():
    """Create a SQLAlchemy engine for the riksdagen Postgres database.

    Credentials and host come from config.py; the port is fixed at 5432.
    """
    url = f"postgresql://{user}:{pwd}@{ip}:5432/riksdagen"
    return sqlalchemy.create_engine(url)
|
|
|
def upload2db(df, if_exists='append'):
    """Write *df* to the Postgres table named by config.db_name.

    Uploads in chunks of 5000 rows with multi-row INSERTs; *if_exists* is
    passed straight through to DataFrame.to_sql.
    """
    # Columns that must be stored as integers rather than inferred text.
    integer_columns = {
        "year": Integer(),
        "dok_nummer": Integer(),
        "anforande_nummer": Integer(),
        # 'anf_sekunder': Integer(),
        # "startpos": Integer()
    }
    df.to_sql(
        db_name,
        get_engine(),
        dtype=integer_columns,
        chunksize=5000,
        method="multi",
        if_exists=if_exists,
        index=False,
    )
|
|
|
def df_from_files(files, test=False):
    """Build a single DataFrame from a list of talk JSON filenames.

    Files are parsed in parallel with d2df, then the text, speaker and
    party columns are normalised. With test=True only the first 1000
    files are processed.
    """
    if test:
        files = files[:1000]

    # Parse in parallel, leaving one core free. (The original also
    # initialised df_list = [] first — dead code, removed.)
    with multiprocessing.Pool(multiprocessing.cpu_count() - 1) as pool:
        df_list = pool.map(d2df, files)
    df = pd.concat(df_list)

    df.fillna('', inplace=True)
    df.drop_duplicates(inplace=True, ignore_index=True)

    # Clean up the speech text. Named functions replace the lambda wrappers.
    df["anforandetext"] = df["anforandetext"].apply(fix_text)
    # NOTE(review): no NaN can remain after the global fillna('') above, so
    # this looks like dead code — kept for safety, but rewritten as an
    # assignment to avoid pandas' chained-assignment inplace pitfall.
    df["anforandetext"] = df["anforandetext"].fillna('""')
    df["anforandetext"] = df["anforandetext"].str.strip()
    df["text_lower"] = df["anforandetext"].str.lower()

    # df["date_"] = df["date"].apply(lambda x: str(x.to_pydatetime())[:10])
    df["talare"] = df["talare"].apply(fix_speaker)
    df["anforande_nummer"] = df["anforande_nummer"].astype(int)
    df['parti'] = df['parti'].str.upper().str.strip()

    return df
|
|
|
if __name__ == "__main__":

    # Collect the talk JSON files. endswith() replaces the original
    # `".json" in file`, which also matched names like "x.json.bak".
    files = [f for f in os.listdir("talks") if f.endswith(".json")]

    df = df_from_files(files)

    # Confirm before replacing the whole table.
    upload = input('Upload and overwrite db? ')
    if upload.strip().lower() in ('y', 'yes'):
        # BUG FIX: pandas to_sql only accepts 'fail' | 'replace' | 'append';
        # the original if_exists='overwrite' raises ValueError.
        upload2db(df, if_exists='replace')