You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

201 lines
6.3 KiB

# takes json from data.riksdagen.se and put it into db.
import json
import multiprocessing
import os
from sqlalchemy.types import Integer
import pandas as pd
import sqlalchemy
from config import db_name
from config import db_user as user
from config import ip_server as ip
from config import pwd_postgres as pwd
from info import months_conversion
def datestring_to_date(x):
    """Convert a Swedish date string like '2 januari 2020' to ISO 'YYYY-MM-DD'.

    Splits on spaces into (day, month-name, year) and maps the month name
    through ``months_conversion``.

    Bug fix: the original computed the zero-padded ``day`` but then used the
    unpadded ``date_list[0]`` in the result, producing e.g. '2020-01-2'.
    """
    date_list = x.split(" ")
    day = date_list[0]
    if len(str(day)) == 1:
        day = "0" + day
    return f"{date_list[2]}-{months_conversion[date_list[1]]}-{day}"
def fix_speaker(y):
    """Strip a leading title (minister, statsråd, talman) from a speaker name.

    Returns the remaining name in Title Case with any trailing ' Replik'
    marker removed and surrounding whitespace stripped. ``None`` becomes ''.
    """
    if y is None:  # was `== None` — identity check is the Python idiom
        y = ""
    x = y.lower()
    # Cut everything up to and including the (case-insensitive) title word.
    if "ministern" in x:
        y = y[x.find("ministern") + 9 :]
    elif "minister" in x:
        y = y[x.find("minister") + 8 :]
    elif "statsrådet" in x:
        y = y[x.find("statsrådet") + 10 :]
    elif "talman" in x:
        y = y[x.find("talman") + 6 :]
    return y.title().replace(" Replik", "").strip()
def get_talks_in_db():
    """Return a set of all id:s in db."""
    con = get_engine()
    ids_df = pd.read_sql('select _id from talks', con)
    return set(ids_df['_id'].tolist())
def format_debate(debate):
    """Map a human-readable debate type to its riksdagen document-type code.

    Unknown debate types are returned unchanged (same behavior as before,
    but via ``dict.get`` instead of try/except KeyError/pass).
    """
    debate_types = {
        "beslut": "kam-vo",
        "debatt om beslut": "bet",
        "frågestund": "kam-fs",
        "information från regeringen": "kam-ar",
        "interpellationsdebatt": "ip",
        "statsministerns frågestund": "kam-sf",
        "öppen utfrågning": "sam-ou",
        "aktuell debatt": "kam-ad",
        "allmänpolitisk debatt": "kam-al",
        "budgetdebatt": "kam-bu",
        "bordläggning": "kam-bp",
        "partiledardebatt": "kam-pd",
        "debatt med anledning av vårpropositionen": "kam-dv",
        "öppet seminarium": "sam-se",
        "utrikespolitisk debatt": "kam-ud",
    }
    return debate_types.get(debate, debate)
def fix_text(text):
    """Normalize a speech text: flatten newlines, collapse non-breaking
    spaces, and drop Word 'STYLEREF' field-code remnants.

    Non-string input (e.g. None or NaN from pandas) lacks ``.replace`` and is
    returned unchanged — the bare ``except:`` is narrowed to AttributeError
    so real errors are no longer silently swallowed.
    """
    try:
        text = text.replace("\n", " ").replace(" ", " ").replace('STYLEREF Kantrubrik \* MERGEFORMAT', '')
    except AttributeError:
        pass
    return text
def get_talk(d, doc, f):
    """Return the 'anforande' entry for talk *d*, offset by the debate's
    first talk number *f*."""
    index = int(d["anforande_nummer"]) - f
    debate = doc["dokumentstatus"]["debatt"]
    return debate["anforande"][index]
def d2df(file):
    """Load one talk JSON file from talks/ and return a one-row DataFrame.

    Adds derived fields (datum, year, _id) and, when a related document in
    documents/ is available, enriches the row with webb-TV metadata
    (audiofileurl, debateurl, timing, speaker gender). Missing keys are
    backfilled with ''.

    Bug fix: the missing-key list had a missing comma, fusing
    '"rel_dok_id" "kon"' into the single key 'rel_dok_idkon', so neither
    'rel_dok_id' nor 'kon' was ever backfilled.
    """
    with open(f'talks/{file}', encoding="utf-8-sig") as f:
        d = json.load(f)["anforande"]
    d['datum'] = str(d['dok_datum'])[:10]
    d['year'] = d['datum'][:4]
    d['_id'] = file.replace('.json', '')
    del d['dok_datum']
    # Open the corresponding document with info on webb-TV.
    if "rel_dok_id" in d:
        rel_dok = d["rel_dok_id"]
        if rel_dok not in [None, '']:
            if ',' in rel_dok:
                rel_dok = rel_dok.split(',')[0]  #! Uses only the first id — could be done better?
            try:
                with open(f'documents/{rel_dok}.json'.lower(), encoding="utf-8-sig") as f:
                    doc = json.load(f)
            except (json.decoder.JSONDecodeError, FileNotFoundError):
                # Missing or corrupt related document: use an empty stub.
                doc = {"dokumentstatus": []}
            if 'debatt' in doc["dokumentstatus"]:
                try:
                    if doc["dokumentstatus"]['debatt'] is not None:
                        first_talk_number = int(doc["dokumentstatus"]['debatt']['anforande'][0]["anf_nummer"])
                        if 'webbmedia' in doc["dokumentstatus"]:
                            d["audiofileurl"] = doc["dokumentstatus"]["webbmedia"]["media"]["audiofileurl"]
                            d["debateurl"] = doc["dokumentstatus"]["webbmedia"]["media"]["debateurl"]
                        talk = get_talk(d, doc, first_talk_number)
                        d["kon"] = talk["kon"]
                        d["debatt_id"] = talk["debatt_id"]
                        d["anf_sekunder"] = talk["anf_sekunder"]
                        d["startpos"] = talk["startpos"]
                except (KeyError, IndexError, TypeError, ValueError):
                    # Document structure varies between debate types; fall
                    # through and backfill the missing keys below.
                    pass
    # Fill in missing keys.
    for i in [
        "audiofileurl",
        "debateurl",
        'anf_sekunder',
        "startpos",
        "debatt_id",
        "rel_dok_id",
        "kon",
    ]:
        if i not in d:
            d[i] = ''
    # Make and return DF.
    return pd.DataFrame(d, index=[file])
def get_ids():
    """Return the set of distinct id_session values already in the talks table.

    Bug fixes: the original referenced an undefined ``db.engine`` and read
    column ``id_long`` although the query selects ``id_session`` — both
    errors were hidden by a bare ``except:``, so the function always
    returned ``[]``. Now uses ``get_engine()`` and the selected column,
    and returns an empty set (consistent type) when the table is missing.
    """
    try:
        df = pd.read_sql("select distinct id_session from talks", get_engine())
        ids = set(df["id_session"].tolist())
    except Exception:
        # Fresh database / table not created yet: no ids stored.
        ids = set()
    return ids
def get_engine():
    """Create a SQLAlchemy engine for the riksdagen Postgres database,
    using credentials from config."""
    url = f"postgresql://{user}:{pwd}@{ip}:5432/riksdagen"
    return sqlalchemy.create_engine(url)
def upload2db(df, if_exists='append'):
    """Write *df* to the ``db_name`` table in Postgres.

    *if_exists* is forwarded to ``DataFrame.to_sql`` ('append' by default).
    """
    dtypes = {
        "year": Integer(),
        "dok_nummer": Integer(),
        "anforande_nummer": Integer(),
        # 'anf_sekunder': Integer(),
        # "startpos": Integer()
    }
    engine = get_engine()
    df.to_sql(
        db_name,
        engine,
        dtype=dtypes,
        chunksize=5000,
        method="multi",
        if_exists=if_exists,
        index=False,
    )
def df_from_files(files, test=False):
    """Parse the given talk JSON filenames into one cleaned DataFrame.

    Files are parsed in parallel with a process pool (one core left free),
    then text, speaker and party columns are normalized. With ``test=True``
    only the first 1000 files are processed.

    Changes: removed the dead ``df_list = []`` assignment (immediately
    overwritten by ``pool.map``), dropped commented-out code, and replaced
    lambda wrappers with direct function/vectorized calls — behavior is
    unchanged.
    """
    if test:
        files = files[:1000]
    with multiprocessing.Pool(multiprocessing.cpu_count() - 1) as pool:
        df_list = pool.map(d2df, files)
    df = pd.concat(df_list)
    df.fillna('', inplace=True)
    df.drop_duplicates(inplace=True, ignore_index=True)
    df["anforandetext"] = df["anforandetext"].apply(fix_text)
    # NOTE(review): df.fillna('') above already removed NaNs, so this
    # fillna is most likely a no-op — kept to preserve behavior.
    df["anforandetext"] = df["anforandetext"].fillna('""')
    df["anforandetext"] = df["anforandetext"].str.strip()
    df["text_lower"] = df["anforandetext"].str.lower()
    df["talare"] = df["talare"].apply(fix_speaker)
    df["anforande_nummer"] = df["anforande_nummer"].astype(int)
    df['parti'] = df['parti'].str.upper().str.strip()
    return df
if __name__ == "__main__":
    # Collect all talk JSON files (endswith is stricter than the old
    # substring check, which also matched e.g. 'x.json.bak').
    files = [file for file in os.listdir("talks") if file.endswith(".json")]
    df = df_from_files(files)
    upload = input('Upload and overwrite db? ')
    if upload in ['y', 'yes']:
        # Bug fix: 'overwrite' is not a valid to_sql if_exists value
        # ('fail'/'replace'/'append') and raised ValueError; 'replace'
        # drops and recreates the table, i.e. the intended overwrite.
        upload2db(df, if_exists='replace')