commit
686dd2ae8f
5 changed files with 1330 additions and 0 deletions
@ -0,0 +1,8 @@ |
|||||||
|
* |
||||||
|
!download_debates.py |
||||||
|
!translate_speeches.py |
||||||
|
!arango_things |
||||||
|
!things |
||||||
|
!streamlit_app_talking_ep.py |
||||||
|
!.gitignore |
||||||
|
!streamlit_info.py |
||||||
@ -0,0 +1,119 @@ |
|||||||
|
import re |
||||||
|
import requests |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from arango_things import arango_db |
||||||
|
from datetime import datetime, timedelta |
||||||
|
|
||||||
|
|
||||||
|
def main(url, date):
    """Download one day's plenary-debate XML and store each speech in ArangoDB.

    Parameters:
        url: URL of the session's English table-of-contents HTML page; it is
            rewritten here to point at the sibling XML document.
        date: Session date string (as built by the caller), used to build
            debate and speech keys.
    """
    # The calendar links to the HTML TOC; the raw XML lives at a sibling URL.
    url = url.replace("-TOC_EN.html", "_EN.xml")

    response = requests.get(url)

    if not response.ok:
        print(response.status_code)
        return

    # Fetch and parse the XML data, keeping only chapters whose English
    # title mentions "debate".
    soup = BeautifulSoup(response.content, "xml")
    debates = [
        chapter
        for chapter in soup.find_all("CHAPTER")
        if "debate" in chapter.find("TL-CHAP", {"VL": "EN"}).text
    ]

    # Iterate through XML elements and extract required data.
    for chapter in debates:
        speech_number = 0
        for contribution in chapter.find_all("INTERVENTION"):
            speech_number += 1
            speaker = contribution.ORATEUR

            name = speaker["LIB"].replace(" | ", " ")

            # The speaker's title (e.g. "president") sits in an EMPHAS tag;
            # strip punctuation and any trailing non-word characters.
            try:
                title = (
                    speaker.find("EMPHAS")
                    .text.replace(". –", "")
                    .replace(".", "")
                    .lower()
                )
                title = re.sub(r"[\W]+$", "", title)
            except AttributeError:
                # No EMPHAS tag for this speaker (find() returned None).
                title = None
            try:
                speaker_type = speaker["SPEAKER_TYPE"].lower()
            except KeyError:
                speaker_type = title

            party = speaker["PP"]
            language = speaker["LG"]
            mep_id = speaker["MEPID"]

            text = "\n".join([i.text for i in contribution.find_all("PARA")])
            debate = contribution.find_parent("CHAPTER")
            debate_number = debate["NUMBER"]
            # Hoisted: the English title tag is needed several times below.
            debate_title_tag = debate.find("TL-CHAP", {"VL": "EN"})
            debate_name = debate_title_tag.text
            try:
                debate_start = debate_title_tag["VOD-START"]
                debate_end = debate_title_tag["VOD-END"]
            except KeyError:
                # Not every debate has video-on-demand timestamps.
                debate_start = None
                debate_end = None
            debate_type = debate_title_tag["TYPE"]

            debate_id = f"{date}_{chapter['NUMBER']}"
            speech_id = f"{debate_id}-{speech_number}"

            if speaker and text:
                # Insert the data into the ArangoDB "speeches" collection
                # (upsert: existing documents are updated in place).
                doc = {
                    "date": date,
                    "name": name,
                    "mep_id": mep_id,
                    "title": title,
                    "speaker_type": speaker_type,
                    "party": party,
                    "debate_id": debate_id,
                    "_key": speech_id,
                    "text": text,
                    "language": language,
                    "debate_number": debate_number,
                    "debate_name": debate_name,
                    "debate_start": debate_start,
                    "debate_end": debate_end,
                    "debate_type": debate_type,
                    "url": url,
                    "speech_number": speech_number,
                }

                print(f"-- {speech_id}")
                arango_db.collection("speeches").insert(
                    doc, overwrite=True, overwrite_mode="update"
                )
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Get the list of plenary sessions (parliamentary term 9) from the
    # Parliament's calendar API.
    url = "https://www.europarl.europa.eu/plenary/en/ajax/getSessionCalendar.html?family=CRE&termId=9"
    calendar = requests.get(url).json()["sessionCalendar"]

    # Filter on dates.
    today = datetime.today()
    date_limit = today - timedelta(days=30)  # Look 30 days back.
    # Dates already ingested, so finished days are not re-downloaded.
    dates_in_db = list(
        arango_db.aql.execute("for doc in speeches return distinct doc.date")
    )

    for day in calendar:
        # NOTE(review): month/day are not zero-padded here (e.g. "2023-1-5");
        # assumes the dates stored in the DB use the same format — confirm.
        date_string = f"{day['year']}-{day['month']}-{day['day']}"
        date_debate = datetime.strptime(date_string, "%Y-%m-%d").date()

        # Skip days already in the DB, days without a URL, and future days.
        if (
            date_string in dates_in_db
            or day["url"] == ""
            or date_debate > today.date()
        ):
            continue
        if date_debate > date_limit.date():
            print(day["url"])
            main(day["url"], date_string)
||||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,112 @@ |
|||||||
|
""" Information and constants are put here and imported into app.py. """

# Maximum number of search results the app will fetch.
results_limit = 1000

# Display colors for the EP political groups, keyed by group abbreviation.
party_colors = {
    "EPP": "#3399FF",
    "S&D": "#F0001C",
    "Renew": "gold",
    "ID": "#0E408A",
    "Greens/EFA": "#57B45F",
    "ECR": "#196CA8",
    "GUE/NGL": "#B71C1C",
    "NA": "grey",
}


# 50 % lighter party colors.
# (Fixed: the assignment was duplicated — `party_colors_lighten =
# party_colors_lighten = {...}` — which was harmless but confusing.)
party_colors_lighten = {
    "EPP": "#f2f2f2",
    "S&D": "#ffe6e8",
    "Renew": "#f2f2f2",
    "ID": "#d6e6fa",
    "Greens/EFA": "#f2f2f2",
    "ECR": "#e6f0fa",
    "GUE/NGL": "#fce6e6",
    "The Left": "#fce6e6",
    "PPE": "#f2f2f2",
    "NA": "#D3D3D3",
    None: "white",
}


# Full names of the EP political groups.
parties = {
    "EPP": "Group of the European People's Party",
    "S&D": "Group of the Progressive Alliance of Socialists and Democrats",
    "RE": "Renew Europe Group",
    "Greens/EFA": "Group of the Greens/European Free Alliance",
    "ID": "Identity and Democracy Group",
    "ECR": "European Conservatives and Reformists Group",
    "GUE/NGL": "Group of the European United Left - Nordic Green Left",
    "NA": "Non-attached Members",
    "Renew": "Renew Europe Group",
}


# CSS injected into the Streamlit app: black links plus one rule per party
# coloring links inside elements with that party's class.
css = """ <style>
a:link {
color: black;
}
a:visited {
color: black;
}
a:hover {
color: grey;
}
"""
for p, c in party_colors.items():  # TODO Update for EU
    # NOTE(review): "NYD", "SD", "" and "-" are Swedish party codes that never
    # occur in party_colors above, so these branches are currently dead.
    if p == "NYD":
        c = "#FFC000"
    if p == "SD":
        c = "#E5AC00"
    if p in ["", "-"]:
        c = "black"
    css += f"\n.{p} a{{color: {c};}}"
css += "\n</style>"

summary_note = """Below is a summary of what parliamentarians from the different parties have said.
It's a summary of the ten most relevant speeches from each party based on the search you made. It's generated with a language model and not always accurate.
Please make sure to check the original text before you use the summary in any way.
"""

# NOTE(review): the explainer still mentions the Riksdag's Web TV — leftover
# from a Swedish version of this app; update for the EU context.
explainer = """This is a database of what members of the European Parliament have said in various debates in the parliament since 2019.
The data comes partly from the EU.
- Start by typing one or more keywords below. You can use asterisk (*), minus(-), quotation marks (""), OR and year\:yyyy-yyyy. The search
`energy crisis* basic power OR nuclear power "fossil-free energy sources" -wind power year:2019-2022` is looking for quotes like\:
    - mentions "energy crisis" (incl. e.g. "energy crisis*")
    - mentions either "base power" *or* "nuclear power"
    - mentions the *exact phrase* "fossil-free energy sources"
    - *does* not mention "wind power"
    - found during the years 2019-2022
- When you have received your results, you can then click away matches or change which years and debate types you are interested in.
- Under "Longer excerpt" you can choose to see the entire speech in text, and under the text there are links to the Riksdag's Web TV and downloadable audio (in the cases
where the debate has been broadcast).

Please tell us how you would like to use the data and about things that don't work. [Email me](mailto:lasse@edfast.se) or [write to me on Twitter](https://twitter.com/lasseedfast).
My name is [Lasse Edfast and I'm a journalist](https://lasseedfast.se) based in Sweden.
"""


# Mapping from debate-type codes to display labels.
# NOTE(review): these are Swedish Riksdag codes/labels, not EU ones.
debate_types = {  # TODO Update for EU
    "kam-vo": "Beslut",
    "bet": "Debatt om beslut",
    "kam-fs": "Frågestund",
    "kam-ar": "Information från regeringen",
    "ip": "Interpellationsdebatt",
    "kam-sf": "Statsministerns frågestund",
    "sam-ou": "Öppen utfrågning",
    "kam-ad": "Aktuell debatt",
    "kam-al": "Allmänpolitisk debatt",
    "kam-bu": "Budgetdebatt",
    "kam-bp": "Bordläggning",
    "kam-pd": "Partiledardebatt",
    "kam-dv": "Debatt med anledning av vårpropositionen",
    "sam-se": "Öppet seminarium",
    "kam-ud": "Utrikespolitisk debatt",
}

limit_warning = """
Your search returns more than 10,000 hits. Try making it more specific, for example by
using a minus sign or specify the year by writing year\:yyyy-yyyy (eg year:2019-2020, without a space after the colon).
The 10,000 hit limit is there to keep the server from crashing and will be raised when I have a stronger server.
"""
||||||
@ -0,0 +1,90 @@ |
|||||||
|
from arango_things import arango_db, get_documents |
||||||
|
from sys import argv |
||||||
|
|
||||||
|
from datetime import datetime |
||||||
|
|
||||||
|
from langchain.llms import LlamaCpp |
||||||
|
from langchain.prompts import PromptTemplate |
||||||
|
from langchain.chains import LLMChain |
||||||
|
from langchain.callbacks.manager import CallbackManager |
||||||
|
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler |
||||||
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
||||||
|
|
||||||
|
|
||||||
|
def translate(text, llm):
    """Return *text* translated into English by the supplied LLM."""
    # The prompt instructs the model to translate verbatim and add nothing.
    template = """
You are a professional translator. Only translate, nothing else, and never add anything of your own.
Translate this text into English.

Text: {text}

Translation:
"""

    chain = LLMChain(
        prompt=PromptTemplate(template=template, input_variables=["text"]),
        llm=llm,
    )
    return chain.run(text)
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Callbacks support token-wise streaming.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

n_gpu_layers = 80  # Change this value based on your model and your GPU VRAM pool.
n_batch = 4096  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.

# Model file may be given on the command line as 'folder/filename'; otherwise
# fall back to the bundled default.
# NOTE(review): previously the defaults unconditionally overwrote the
# command-line choice (dead argv parsing) — restored as an else branch.
if len(argv) > 1:
    model = argv[1]
    model_folder = model[:model.rfind('/')]
    model_filename = model[model.rfind('/') + 1:]
else:
    model_folder = 'model_files'
    model_filename = 'mistral-7b-openorca.Q4_K_M.gguf'  # 'mistral-7b-openorca.Q5_K_S.gguf'

llm = LlamaCpp(
    model_path=f'{model_folder}/{model_filename}',
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=4096,
    temperature=0,
    max_tokens=2500,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager.
)

# Speeches longer than the context window are split into ~4000-char chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=0)

# Get one random record without translation per iteration.
query = '''
FOR doc IN speeches
FILTER doc.language != 'EN'
FILTER CHAR_LENGTH(doc.translation) < 10
SORT RAND()
LIMIT 1
RETURN doc
'''

while True:
    cursor = arango_db.aql.execute(query=query, count=True)

    if cursor.count() == 1:
        record = cursor.next()
    else:
        print('Done!')
        break

    # Translate chunk by chunk, then store the joined result with metadata.
    try:
        print(f'\n\n{record["_key"]}\n')

        translation = []
        for chunk in text_splitter.split_text(record['text']):
            translation.append(translate(chunk, llm))
        record['translation'] = ' '.join(translation)
        record['translation_metas'] = {
            'with': 'LlamaCpp',
            'model': model_filename,
            'date': datetime.today().strftime('%Y-%m-%d'),
        }
        arango_db.collection("speeches").update(record)
    except Exception as e:
        # Keep going on a bad record, but report it instead of the original
        # silent bare `except: pass`.
        print(f'Failed on {record["_key"]}: {e}')
||||||
Loading…
Reference in new issue