commit
686dd2ae8f
5 changed files with 1330 additions and 0 deletions
@ -0,0 +1,8 @@ |
|||||||
|
* |
||||||
|
!download_debates.py |
||||||
|
!translate_speeches.py |
||||||
|
!arango_things |
||||||
|
!things |
||||||
|
!streamlit_app_talking_ep.py |
||||||
|
!.gitignore |
||||||
|
!streamlit_info.py |
||||||
@ -0,0 +1,119 @@ |
|||||||
|
import re |
||||||
|
import requests |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from arango_things import arango_db |
||||||
|
from datetime import datetime, timedelta |
||||||
|
|
||||||
|
|
||||||
|
def main(url, date):
    """Download one day's plenary-debate XML and store each speech in ArangoDB.

    Parameters:
        url: URL of the session's English table-of-contents HTML page; it is
            rewritten here to point at the sibling XML document.
        date: Session date string (as built by the caller), used to build
            debate and speech keys.
    """
    # The calendar links to the HTML TOC; the raw XML lives at a sibling URL.
    url = url.replace("-TOC_EN.html", "_EN.xml")

    response = requests.get(url)

    if not response.ok:
        print(response.status_code)
        return

    # Fetch and parse the XML data, keeping only chapters whose English
    # title mentions "debate".
    soup = BeautifulSoup(response.content, "xml")
    debates = [
        chapter
        for chapter in soup.find_all("CHAPTER")
        if "debate" in chapter.find("TL-CHAP", {"VL": "EN"}).text
    ]

    # Iterate through XML elements and extract required data.
    for chapter in debates:
        speech_number = 0
        for contribution in chapter.find_all("INTERVENTION"):
            speech_number += 1
            speaker = contribution.ORATEUR

            name = speaker["LIB"].replace(" | ", " ")

            # The speaker's title (e.g. "president") sits in an EMPHAS tag;
            # strip punctuation and any trailing non-word characters.
            try:
                title = (
                    speaker.find("EMPHAS")
                    .text.replace(". –", "")
                    .replace(".", "")
                    .lower()
                )
                title = re.sub(r"[\W]+$", "", title)
            except AttributeError:
                # No EMPHAS tag for this speaker (find() returned None).
                title = None
            try:
                speaker_type = speaker["SPEAKER_TYPE"].lower()
            except KeyError:
                speaker_type = title

            party = speaker["PP"]
            language = speaker["LG"]
            mep_id = speaker["MEPID"]

            text = "\n".join([i.text for i in contribution.find_all("PARA")])
            debate = contribution.find_parent("CHAPTER")
            debate_number = debate["NUMBER"]
            # Hoisted: the English title tag is needed several times below.
            debate_title_tag = debate.find("TL-CHAP", {"VL": "EN"})
            debate_name = debate_title_tag.text
            try:
                debate_start = debate_title_tag["VOD-START"]
                debate_end = debate_title_tag["VOD-END"]
            except KeyError:
                # Not every debate has video-on-demand timestamps.
                debate_start = None
                debate_end = None
            debate_type = debate_title_tag["TYPE"]

            debate_id = f"{date}_{chapter['NUMBER']}"
            speech_id = f"{debate_id}-{speech_number}"

            if speaker and text:
                # Insert the data into the ArangoDB "speeches" collection
                # (upsert: existing documents are updated in place).
                doc = {
                    "date": date,
                    "name": name,
                    "mep_id": mep_id,
                    "title": title,
                    "speaker_type": speaker_type,
                    "party": party,
                    "debate_id": debate_id,
                    "_key": speech_id,
                    "text": text,
                    "language": language,
                    "debate_number": debate_number,
                    "debate_name": debate_name,
                    "debate_start": debate_start,
                    "debate_end": debate_end,
                    "debate_type": debate_type,
                    "url": url,
                    "speech_number": speech_number,
                }

                print(f"-- {speech_id}")
                arango_db.collection("speeches").insert(
                    doc, overwrite=True, overwrite_mode="update"
                )
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Get the list of plenary sessions (parliamentary term 9) from the
    # Parliament's calendar API.
    url = "https://www.europarl.europa.eu/plenary/en/ajax/getSessionCalendar.html?family=CRE&termId=9"
    calendar = requests.get(url).json()["sessionCalendar"]

    # Filter on dates.
    today = datetime.today()
    date_limit = today - timedelta(days=30)  # Look 30 days back.
    # Dates already ingested, so finished days are not re-downloaded.
    dates_in_db = list(
        arango_db.aql.execute("for doc in speeches return distinct doc.date")
    )

    for day in calendar:
        # NOTE(review): month/day are not zero-padded here (e.g. "2023-1-5");
        # assumes the dates stored in the DB use the same format — confirm.
        date_string = f"{day['year']}-{day['month']}-{day['day']}"
        date_debate = datetime.strptime(date_string, "%Y-%m-%d").date()

        # Skip days already in the DB, days without a URL, and future days.
        if (
            date_string in dates_in_db
            or day["url"] == ""
            or date_debate > today.date()
        ):
            continue
        if date_debate > date_limit.date():
            print(day["url"])
            main(day["url"], date_string)
||||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,112 @@ |
|||||||
|
""" Information and constants are put here and imported into app.py. """

# Maximum number of search results the app will fetch.
results_limit = 1000

# Display colors for the EP political groups, keyed by group abbreviation.
party_colors = {
    "EPP": "#3399FF",
    "S&D": "#F0001C",
    "Renew": "gold",
    "ID": "#0E408A",
    "Greens/EFA": "#57B45F",
    "ECR": "#196CA8",
    "GUE/NGL": "#B71C1C",
    "NA": "grey",
}


# 50 % lighter party colors.
# (Fixed: the assignment was duplicated — `party_colors_lighten =
# party_colors_lighten = {...}` — which was harmless but confusing.)
party_colors_lighten = {
    "EPP": "#f2f2f2",
    "S&D": "#ffe6e8",
    "Renew": "#f2f2f2",
    "ID": "#d6e6fa",
    "Greens/EFA": "#f2f2f2",
    "ECR": "#e6f0fa",
    "GUE/NGL": "#fce6e6",
    "The Left": "#fce6e6",
    "PPE": "#f2f2f2",
    "NA": "#D3D3D3",
    None: "white",
}


# Full names of the EP political groups.
parties = {
    "EPP": "Group of the European People's Party",
    "S&D": "Group of the Progressive Alliance of Socialists and Democrats",
    "RE": "Renew Europe Group",
    "Greens/EFA": "Group of the Greens/European Free Alliance",
    "ID": "Identity and Democracy Group",
    "ECR": "European Conservatives and Reformists Group",
    "GUE/NGL": "Group of the European United Left - Nordic Green Left",
    "NA": "Non-attached Members",
    "Renew": "Renew Europe Group",
}


# CSS injected into the Streamlit app: black links plus one rule per party
# coloring links inside elements with that party's class.
css = """ <style>
a:link {
color: black;
}
a:visited {
color: black;
}
a:hover {
color: grey;
}
"""
for p, c in party_colors.items():  # TODO Update for EU
    # NOTE(review): "NYD", "SD", "" and "-" are Swedish party codes that never
    # occur in party_colors above, so these branches are currently dead.
    if p == "NYD":
        c = "#FFC000"
    if p == "SD":
        c = "#E5AC00"
    if p in ["", "-"]:
        c = "black"
    css += f"\n.{p} a{{color: {c};}}"
css += "\n</style>"

summary_note = """Below is a summary of what parliamentarians from the different parties have said.
It's a summary of the ten most relevant speeches from each party based on the search you made. It's generated with a language model and not always accurate.
Please make sure to check the original text before you use the summary in any way.
"""

# NOTE(review): the explainer still mentions the Riksdag's Web TV — leftover
# from a Swedish version of this app; update for the EU context.
explainer = """This is a database of what members of the European Parliament have said in various debates in the parliament since 2019.
The data comes partly from the EU.
- Start by typing one or more keywords below. You can use asterisk (*), minus(-), quotation marks (""), OR and year\:yyyy-yyyy. The search
`energy crisis* basic power OR nuclear power "fossil-free energy sources" -wind power year:2019-2022` is looking for quotes like\:
    - mentions "energy crisis" (incl. e.g. "energy crisis*")
    - mentions either "base power" *or* "nuclear power"
    - mentions the *exact phrase* "fossil-free energy sources"
    - *does* not mention "wind power"
    - found during the years 2019-2022
- When you have received your results, you can then click away matches or change which years and debate types you are interested in.
- Under "Longer excerpt" you can choose to see the entire speech in text, and under the text there are links to the Riksdag's Web TV and downloadable audio (in the cases
where the debate has been broadcast).

Please tell us how you would like to use the data and about things that don't work. [Email me](mailto:lasse@edfast.se) or [write to me on Twitter](https://twitter.com/lasseedfast).
My name is [Lasse Edfast and I'm a journalist](https://lasseedfast.se) based in Sweden.
"""


# Mapping from debate-type codes to display labels.
# NOTE(review): these are Swedish Riksdag codes/labels, not EU ones.
debate_types = {  # TODO Update for EU
    "kam-vo": "Beslut",
    "bet": "Debatt om beslut",
    "kam-fs": "Frågestund",
    "kam-ar": "Information från regeringen",
    "ip": "Interpellationsdebatt",
    "kam-sf": "Statsministerns frågestund",
    "sam-ou": "Öppen utfrågning",
    "kam-ad": "Aktuell debatt",
    "kam-al": "Allmänpolitisk debatt",
    "kam-bu": "Budgetdebatt",
    "kam-bp": "Bordläggning",
    "kam-pd": "Partiledardebatt",
    "kam-dv": "Debatt med anledning av vårpropositionen",
    "sam-se": "Öppet seminarium",
    "kam-ud": "Utrikespolitisk debatt",
}

limit_warning = """
Your search returns more than 10,000 hits. Try making it more specific, for example by
using a minus sign or specify the year by writing year\:yyyy-yyyy (eg year:2019-2020, without a space after the colon).
The 10,000 hit limit is there to keep the server from crashing and will be raised when I have a stronger server.
"""
||||||
@ -0,0 +1,90 @@ |
|||||||
|
from arango_things import arango_db, get_documents |
||||||
|
from sys import argv |
||||||
|
|
||||||
|
from datetime import datetime |
||||||
|
|
||||||
|
from langchain.llms import LlamaCpp |
||||||
|
from langchain.prompts import PromptTemplate |
||||||
|
from langchain.chains import LLMChain |
||||||
|
from langchain.callbacks.manager import CallbackManager |
||||||
|
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler |
||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
||||||
|
|
||||||
|
|
||||||
|
def translate(text, llm):
    """Return *text* translated into English by the supplied LLM."""
    # The prompt instructs the model to translate verbatim and add nothing.
    template = """
You are a professional translator. Only translate, nothing else, and never add anything of your own.
Translate this text into English.

Text: {text}

Translation:
"""

    chain = LLMChain(
        prompt=PromptTemplate(template=template, input_variables=["text"]),
        llm=llm,
    )
    return chain.run(text)
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Callbacks support token-wise streaming.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

n_gpu_layers = 80  # Change this value based on your model and your GPU VRAM pool.
n_batch = 4096  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Model file may be given on the command line as 'folder/filename'; otherwise
# fall back to the bundled default.
# NOTE(review): previously the defaults unconditionally overwrote the
# command-line choice (dead argv parsing) — restored as an else branch.
if len(argv) > 1:
    model = argv[1]
    model_folder = model[:model.rfind('/')]
    model_filename = model[model.rfind('/') + 1:]
else:
    model_folder = 'model_files'
    model_filename = 'mistral-7b-openorca.Q4_K_M.gguf'  # 'mistral-7b-openorca.Q5_K_S.gguf'

llm = LlamaCpp(
    model_path=f'{model_folder}/{model_filename}',
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=4096,
    temperature=0,
    max_tokens=2500,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager.
)

# Speeches longer than the context window are split into ~4000-char chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=0)

# Get one random record without translation per iteration.
query = '''
FOR doc IN speeches
FILTER doc.language != 'EN'
FILTER CHAR_LENGTH(doc.translation) < 10
SORT RAND()
LIMIT 1
RETURN doc
'''

while True:
    cursor = arango_db.aql.execute(query=query, count=True)

    if cursor.count() == 1:
        record = cursor.next()
    else:
        print('Done!')
        break

    # Translate chunk by chunk, then store the joined result with metadata.
    try:
        print(f'\n\n{record["_key"]}\n')

        translation = []
        for chunk in text_splitter.split_text(record['text']):
            translation.append(translate(chunk, llm))
        record['translation'] = ' '.join(translation)
        record['translation_metas'] = {
            'with': 'LlamaCpp',
            'model': model_filename,
            'date': datetime.today().strftime('%Y-%m-%d'),
        }
        arango_db.collection("speeches").update(record)
    except Exception as e:
        # Keep going on a bad record, but report it instead of the original
        # silent bare `except: pass`.
        print(f'Failed on {record["_key"]}: {e}')
||||||
Loading…
Reference in new issue