You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

119 lines
4.3 KiB

import re
import requests
from bs4 import BeautifulSoup
from arango_things import arango_db
from datetime import datetime, timedelta
def _is_debate(chapter):
    """Return True when the chapter's English heading contains "debate"."""
    heading = chapter.find("TL-CHAP", {"VL": "EN"})
    # A chapter without an English heading cannot be classified; treat it as
    # "not a debate" instead of failing on None.text.
    return heading is not None and "debate" in heading.text


def _speaker_title(speaker):
    """Return the cleaned, lower-cased text of the speaker's EMPHAS element, or None."""
    emphasis = speaker.find("EMPHAS")
    if emphasis is None:
        return None
    title = emphasis.text.replace(". –", "").replace(".", "").lower()
    # Strip any trailing run of non-word characters left after the replacements.
    return re.sub(r"[\W]+$", "", title)


def main(url, date):
    """Download one verbatim report and store its debate speeches in ArangoDB.

    The ``-TOC_EN.html`` suffix of ``url`` is swapped for ``_EN.xml`` and the
    resulting XML document is fetched. Every CHAPTER whose English heading
    contains "debate" is scanned, and each INTERVENTION that has both a
    speaker (ORATEUR) and non-empty paragraph text is written to the
    ``speeches`` collection. Existing documents with the same ``_key`` are
    updated.

    Args:
        url: Address of the report page (presumably the table-of-contents
            page, given the ``-TOC_EN.html`` suffix that is replaced).
        date: Date string stored on each document and used as the prefix of
            the debate and speech identifiers.

    Returns:
        None. On a non-OK HTTP response the status code is printed and
        nothing is stored.
    """
    url = url.replace("-TOC_EN.html", "_EN.xml")
    response = requests.get(url)
    if not response.ok:
        print(response.status_code)
        return

    # Fetch and parse the XML data
    soup = BeautifulSoup(response.content, "xml")
    debates = [chapter for chapter in soup.find_all("CHAPTER") if _is_debate(chapter)]

    # Iterate through XML elements and extract required data
    for chapter in debates:
        # Numbering counts every INTERVENTION, including skipped ones, so the
        # generated speech keys stay stable between runs.
        for speech_number, contribution in enumerate(
            chapter.find_all("INTERVENTION"), start=1
        ):
            speaker = contribution.ORATEUR
            text = "\n".join(para.text for para in contribution.find_all("PARA"))
            # Skip before touching any speaker attribute: subscripting a
            # missing ORATEUR would raise instead of skipping the entry.
            if speaker is None or not text:
                continue

            name = speaker["LIB"].replace(" | ", " ")
            title = _speaker_title(speaker)
            try:
                speaker_type = speaker["SPEAKER_TYPE"].lower()
            except KeyError:
                # No explicit speaker type: fall back to the parsed title.
                speaker_type = title
            party = speaker["PP"]
            language = speaker["LG"]
            mep_id = speaker["MEPID"]

            debate = contribution.find_parent("CHAPTER")
            debate_number = debate["NUMBER"]
            # Look the English heading up once and reuse it below.
            heading = debate.find("TL-CHAP", {"VL": "EN"})
            debate_name = heading.text
            try:
                debate_start = heading["VOD-START"]
                debate_end = heading["VOD-END"]
            except KeyError:
                # If either timestamp is missing, both are stored as None.
                debate_start = None
                debate_end = None
            debate_type = heading["TYPE"]

            debate_id = f"{date}_{chapter['NUMBER']}"
            speech_id = f"{debate_id}-{speech_number}"

            # Insert the data into the ArangoDB "speeches" collection
            doc = {
                "date": date,
                "name": name,
                "mep_id": mep_id,
                "title": title,
                "speaker_type": speaker_type,
                "party": party,
                "debate_id": debate_id,
                "_key": speech_id,
                "text": text,
                "language": language,
                "debate_number": debate_number,
                "debate_name": debate_name,
                "debate_start": debate_start,
                "debate_end": debate_end,
                "debate_type": debate_type,
                "url": url,
                "speech_number": speech_number,
            }
            print(f"-- {speech_id}")
            arango_db.collection("speeches").insert(
                doc, overwrite=True, overwrite_mode="update"
            )
if __name__ == "__main__":
    # Download the session calendar (the query string selects family=CRE, termId=9).
    calendar_url = "https://www.europarl.europa.eu/plenary/en/ajax/getSessionCalendar.html?family=CRE&termId=9"
    sessions = requests.get(calendar_url).json()["sessionCalendar"]

    # Date window: nothing in the future, nothing older than 30 days.
    now = datetime.today()
    current_day = now.date()
    earliest_day = (now - timedelta(days=30)).date()

    # Dates that already have speeches stored are not fetched again.
    known_dates = list(
        arango_db.aql.execute("for doc in speeches return distinct doc.date")
    )

    for entry in sessions:
        date_string = f"{entry['year']}-{entry['month']}-{entry['day']}"
        session_day = datetime.strptime(date_string, "%Y-%m-%d").date()
        session_url = entry["url"]

        already_stored = date_string in known_dates
        has_no_report = session_url == ""
        in_future = session_day > current_day
        if already_stored or has_no_report or in_future:
            continue

        # Only sessions strictly newer than the 30-day limit are processed.
        if session_day > earliest_day:
            print(session_url)
            main(session_url, date_string)