import re
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

from arango_things import arango_db


def main(url, date):
    """Scrape one sitting day's plenary transcript and upsert its speeches.

    Parameters:
        url: TOC URL for the sitting (``...-TOC_EN.html``); rewritten to the
            corresponding full-transcript XML (``..._EN.xml``).
        date: ISO-ish date string (``YYYY-M-D``) used to build document keys.

    Side effects: upserts one document per speech into the ArangoDB
    ``speeches`` collection; prints each speech id as it is stored.
    """
    # The TOC page is an HTML index; the complete transcript is the XML twin.
    url = url.replace("-TOC_EN.html", "_EN.xml")
    response = requests.get(url)
    if not response.ok:
        print(response.status_code)
        return

    # Fetch and parse the XML data.
    soup = BeautifulSoup(response.content, "xml")

    # Keep only chapters whose English title mentions "debate".
    debates = [
        chapter
        for chapter in soup.find_all("CHAPTER")
        if "debate" in chapter.find("TL-CHAP", {"VL": "EN"}).text
    ]

    # Iterate through XML elements and extract required data.
    for chapter in debates:
        speech_number = 0
        for contribution in chapter.find_all("INTERVENTION"):
            speech_number += 1
            speaker = contribution.ORATEUR
            name = speaker["LIB"].replace(" | ", " ")
            try:
                title = (
                    speaker.find("EMPHAS")
                    .text.replace(". –", "")
                    .replace(".", "")
                    .lower()
                )
                title = re.sub(r"[\W]+$", "", title)
            except AttributeError:
                # No <EMPHAS> element for this speaker: find() returned None.
                # (Was a bare `except:`, which also hid real errors.)
                title = None
            try:
                speaker_type = speaker["SPEAKER_TYPE"].lower()
            except KeyError:
                # Fall back to the parsed title when the attribute is absent.
                speaker_type = title
            party = speaker["PP"]
            language = speaker["LG"]
            mep_id = speaker["MEPID"]
            text = "\n".join(para.text for para in contribution.find_all("PARA"))

            debate = contribution.find_parent("CHAPTER")
            debate_number = debate["NUMBER"]
            # Hoist the English title tag: it is consulted four times below.
            title_tag = debate.find("TL-CHAP", {"VL": "EN"})
            debate_name = title_tag.text
            try:
                debate_start = title_tag["VOD-START"]
                debate_end = title_tag["VOD-END"]
            except KeyError:
                # Not every chapter carries video-on-demand timestamps.
                debate_start = None
                debate_end = None
            debate_type = title_tag["TYPE"]
            debate_id = f"{date}_{chapter['NUMBER']}"
            speech_id = f"{debate_id}-{speech_number}"

            if speaker and text:
                # Upsert the speech into the ArangoDB "speeches" collection
                # (fixed comment: this was never SQLite).
                doc = {
                    "date": date,
                    "name": name,
                    "mep_id": mep_id,
                    "title": title,
                    "speaker_type": speaker_type,
                    "party": party,
                    "debate_id": debate_id,
                    "_key": speech_id,
                    "text": text,
                    "language": language,
                    "debate_number": debate_number,
                    "debate_name": debate_name,
                    "debate_start": debate_start,
                    "debate_end": debate_end,
                    "debate_type": debate_type,
                    "url": url,
                    "speech_number": speech_number,
                }
                print(f"-- {speech_id}")
                arango_db.collection("speeches").insert(
                    doc, overwrite=True, overwrite_mode="update"
                )


if __name__ == "__main__":
    # Get the list of sitting days for the current parliamentary term.
    url = (
        "https://www.europarl.europa.eu/plenary/en/ajax/getSessionCalendar.html"
        "?family=CRE&termId=9"
    )
    calendar = requests.get(url).json()["sessionCalendar"]

    # Filter on dates.
    today = datetime.today()
    date_limit = today - timedelta(days=30)  # Look 30 days back.

    # Dates already present in the database (skipped below).
    dates_in_db = list(
        arango_db.aql.execute("for doc in speeches return distinct doc.date")
    )

    for day in calendar:
        # NOTE(review): assumes the calendar feed supplies zero-padded
        # month/day values (strptime with %m/%d tolerates unpadded input,
        # but the stored date strings would then be inconsistent) — confirm.
        date_string = f"{day['year']}-{day['month']}-{day['day']}"
        date_debate = datetime.strptime(date_string, "%Y-%m-%d").date()
        # Skip: days already scraped, days with no transcript URL (empty or
        # missing — the old `== ""` check let None through), future sittings.
        if (
            date_string in dates_in_db
            or not day["url"]
            or date_debate > today.date()
        ):
            continue
        if date_debate > date_limit.date():
            print(day["url"])
            main(day["url"], date_string)