You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
119 lines
4.3 KiB
119 lines
4.3 KiB
import re |
|
import requests |
|
from bs4 import BeautifulSoup |
|
from arango_things import arango_db |
|
from datetime import datetime, timedelta |
|
|
|
|
|
def _speaker_title(speaker):
    """Return the normalised title found in the speaker's <EMPHAS> child.

    The raw text has its ". –" separator and full stops removed, is
    lower-cased, and has any trailing non-word characters stripped.
    Returns None when the speaker element has no <EMPHAS> child.
    """
    emphasis = speaker.find("EMPHAS")
    if emphasis is None:
        return None
    title = emphasis.text.replace(". –", "").replace(".", "").lower()
    return re.sub(r"[\W]+$", "", title)


def main(url, date, timeout=60):
    """Fetch one sitting's XML report and store its debate speeches in ArangoDB.

    The table-of-contents URL is rewritten to the matching "_EN.xml" URL and
    downloaded. Every <CHAPTER> whose English <TL-CHAP> heading contains
    "debate" is treated as a debate, and each <INTERVENTION> inside it that
    has both an <ORATEUR> element and non-empty <PARA> text is upserted into
    the "speeches" collection.

    Args:
        url: URL of the sitting's table of contents ("...-TOC_EN.html").
        date: Date string of the sitting; stored on every document and used
            as the prefix of the debate and speech identifiers.
        timeout: Seconds passed to ``requests.get`` so a stalled server
            cannot block the run indefinitely.

    Returns:
        None. On a non-OK HTTP response the status code is printed and
        nothing is stored.
    """
    url = url.replace("-TOC_EN.html", "_EN.xml")

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        print(response.status_code)
        return

    # Fetch and parse the XML data
    soup = BeautifulSoup(response.content, "xml")

    debates = []
    for chapter in soup.find_all("CHAPTER"):
        heading = chapter.find("TL-CHAP", {"VL": "EN"})
        # Chapters without an English heading cannot be classified; skip them
        # instead of failing on ``None.text``.
        if heading is not None and "debate" in heading.text:
            debates.append(chapter)

    speeches = arango_db.collection("speeches")

    # Iterate through XML elements and extract required data
    for chapter in debates:
        # The counter advances for every intervention, including skipped ones,
        # so speech ids stay aligned with the position in the source document.
        for speech_number, contribution in enumerate(
            chapter.find_all("INTERVENTION"), start=1
        ):
            speaker = contribution.ORATEUR
            text = "\n".join(para.text for para in contribution.find_all("PARA"))

            # Nothing to store without a speaker element or any spoken text.
            if speaker is None or not text:
                continue

            raw_name = speaker.get("LIB")
            name = raw_name.replace(" | ", " ") if raw_name is not None else None

            title = _speaker_title(speaker)

            # Fall back to the title when no explicit speaker type is given.
            speaker_type = speaker.get("SPEAKER_TYPE")
            speaker_type = speaker_type.lower() if speaker_type is not None else title

            # Optional metadata: a missing attribute is stored as None rather
            # than aborting the import of the whole sitting.
            party = speaker.get("PP")
            language = speaker.get("LG")
            mep_id = speaker.get("MEPID")

            debate = contribution.find_parent("CHAPTER")
            debate_number = debate["NUMBER"]
            debate_heading = debate.find("TL-CHAP", {"VL": "EN"})
            debate_name = debate_heading.text
            try:
                debate_start = debate_heading["VOD-START"]
                debate_end = debate_heading["VOD-END"]
            except KeyError:
                # If either timestamp is missing, both are treated as unknown.
                debate_start = None
                debate_end = None
            debate_type = debate_heading.get("TYPE")

            debate_id = f"{date}_{chapter['NUMBER']}"
            speech_id = f"{debate_id}-{speech_number}"

            # Insert the data into the ArangoDB "speeches" collection
            doc = {
                "date": date,
                "name": name,
                "mep_id": mep_id,
                "title": title,
                "speaker_type": speaker_type,
                "party": party,
                "debate_id": debate_id,
                "_key": speech_id,
                "text": text,
                "language": language,
                "debate_number": debate_number,
                "debate_name": debate_name,
                "debate_start": debate_start,
                "debate_end": debate_end,
                "debate_type": debate_type,
                "url": url,
                "speech_number": speech_number,
            }

            print(f"-- {speech_id}")
            # overwrite + "update" turns the insert into an upsert keyed on _key.
            speeches.insert(doc, overwrite=True, overwrite_mode="update")
|
|
|
|
|
if __name__ == "__main__":
    # Download the session calendar that lists one entry per sitting day.
    calendar_url = "https://www.europarl.europa.eu/plenary/en/ajax/getSessionCalendar.html?family=CRE&termId=9"
    sessions = requests.get(calendar_url).json()["sessionCalendar"]

    # Only sittings from the last 30 days are considered.
    now = datetime.today()
    cutoff = now - timedelta(days=30)

    # Dates that already have at least one speech stored.
    stored_dates = list(
        arango_db.aql.execute("for doc in speeches return distinct doc.date")
    )

    for session in sessions:
        date_string = f"{session['year']}-{session['month']}-{session['day']}"
        sitting_date = datetime.strptime(date_string, "%Y-%m-%d").date()

        # Evaluate every skip condition up front, then decide.
        already_stored = date_string in stored_dates
        no_report = session["url"] == ""
        in_future = sitting_date > now.date()
        if already_stored or no_report or in_future:
            continue

        if sitting_date > cutoff.date():
            print(session["url"])
            main(session["url"], date_string)
|
|
|