main
parent 9122f0c84f commit 7a529055e8
3 changed files with 384 additions and 3 deletions
@@ -1,11 +1,11 @@
*
!streamlit_info.py
!download_debates.py
!translate_speeches.py
!arango_things
!things
!arango_things.py
!things.py
!streamlit_app_talking_ep.py
!.gitignore
!streamlit_info.py
!notes.md
!llama_server.py
!requirements_streamlit.txt
@@ -0,0 +1,68 @@
from arango import ArangoClient, exceptions
import pandas as pd
import yaml


def get_documents(query=False, collection=False, fields=None, filter='', df=False, index=False, field_names=False):
    """
    Retrieves documents from ArangoDB, either from a specified collection or via a raw AQL query.

    Parameters:
    query (str): If specified, this query is executed as-is and the query-building parameters below are ignored. Defaults to False.
    collection (str): The name of the collection from which to retrieve documents. Defaults to False.
    fields (list): The fields of the documents to retrieve. If None or empty, all fields are retrieved.
    filter (str): AQL filter clause to apply to the retrieval. Defaults to no filter.
    df (bool): If True, the result is returned as a pandas DataFrame. Defaults to False.
    index (str): If set and df is True, the DataFrame is indexed on this column. Defaults to False.
    field_names (dict): If provided, these field names replace the original field names in the result.

    Returns:
    list or DataFrame: The retrieved documents as a list of dictionaries or as a DataFrame.
    """
    if not query:
        if not fields:
            return_fields = 'doc'
        else:
            fields_dict = {field: field for field in fields}

            if field_names:
                for k, v in field_names.items():
                    fields_dict[k] = v

            fields_list = [f'{v}: doc.{k}' for k, v in fields_dict.items()]
            fields_string = ', '.join(fields_list)
            return_fields = f"{{{fields_string}}}"
        query = f'''
            for doc
            in {collection}
            {filter}
            return {return_fields}
            '''
    try:
        cursor = arango_db.aql.execute(query)
    except exceptions.AQLQueryExecuteError:
        print('ERROR:\n', query)
        raise
    result = [i for i in cursor]
    if df:
        result = pd.DataFrame(result)
        if index:
            result.set_index(index, inplace=True)
    return result


with open('config.yml', 'r') as f:
    config = yaml.safe_load(f)

db = config['arango']['db']
username = config['arango']['username']
pwd = config['arango']['pwd_lasse']

# Initialize the ArangoDB client and connect to the database.
client = ArangoClient(hosts=config['arango']['hosts'])
arango_db = client.db(db, username=username, password=pwd)
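
# Usage sketch (not part of the original commit; the collection name 'speeches'
# and its fields are hypothetical). The filter string is inserted verbatim into
# the AQL template, so it must carry its own 'filter' keyword:
#     docs = get_documents(collection='speeches',
#                          fields=['speaker', 'text'],
#                          filter="filter doc.date >= '2023-01-01'",
#                          df=True)
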
@@ -0,0 +1,313 @@
import math
import re
import sqlite3

import nltk
import tiktoken


def normalize_party_names(name):
    """
    Normalizes party names to the format used in the database.

    Parameters:
    name (str): The party name to be normalized.

    Returns:
    str: The normalized party name.
    """

    parties = {
        "EPP": "EPP",
        "PPE": "EPP",
        "RE": "Renew",
        "S-D": "S&D",
        "S&D": "S&D",
        "ID": "ID",
        "ECR": "ECR",
        "GUE/NGL": "GUE/NGL",
        "The Left": "GUE/NGL",
        "Greens/EFA": "Greens/EFA",
        "G/EFA": "Greens/EFA",
        "Verts/ALE": "Greens/EFA",
        "NA": "NA",
        "NULL": "NA",
        None: "NA",
        "-": "NA",
        "Vacant": "NA",
        "NI": "NA",
        "Renew": "Renew",
    }

    return parties[name]
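

# Usage sketch (not part of the original commit): the table above folds the
# spelling variants onto one canonical label per group, e.g.
#     normalize_party_names('Verts/ALE')  # -> 'Greens/EFA'
#     normalize_party_names(None)         # -> 'NA'
# Unknown names raise a KeyError rather than passing through silently.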


def count_tokens(string: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(string))
    return num_tokens
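

# Usage sketch (not part of the original commit): cl100k_base is the encoding
# used by the GPT-3.5/GPT-4 model family, so ordinary English text comes out at
# roughly one token per 3-4 characters (text_splitter's fallback below assumes 3):
#     count_tokens('Some speech text.')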


def whitespace_remover(text):
    """Strips leading whitespace from a text string."""
    return re.sub(r'^\s+', '', text)


def fix_date(string, pattern):
    """
    Args:
    string (str): The input string containing the date.
    pattern (str): The pattern used to locate the date parts in the string. It should contain runs of 'y', 'm', and 'd' marking the positions of the year, month, and day respectively.

    Returns:
    str: The formatted date string in 'yyyy-mm-dd' format.

    Example:
    >>> fix_date('20211231', 'yyyymmdd')
    '2021-12-31'
    """

    # Find the positions of the year, month and day runs in the pattern.
    y = re.search(r'y+', pattern)
    m = re.search(r'm+', pattern)
    d = re.search(r'd+', pattern)

    # Extract each part from the same positions in the input string.
    year = string[y.span()[0]:y.span()[1]]
    month = string[m.span()[0]:m.span()[1]]
    day = string[d.span()[0]:d.span()[1]]

    return f'{year}-{month}-{day}'


def fix_doc_name(string, pattern):
    """
    Extracts the year, number, and letters of a document name from a string.

    Args:
    string (str): The string from which to extract the year, number, and letters.
    pattern (str): The pattern to search for in the string, using runs of 'y', 'n', and 'l' to mark the positions of the year, number, and letters.

    Returns:
    dict: A dictionary with keys 'year', 'number', and 'letters' and their corresponding values from the string.

    Example:
    >>> fix_doc_name('COM/2021/570', 'lll/yyyy/nnn')
    {'year': '2021', 'number': '570', 'letters': 'COM'}
    """

    # Find the positions of the y, n and l runs in the pattern.
    y = re.search(r'y+', pattern)
    n = re.search(r'n+', pattern)
    l = re.search(r'l+', pattern)

    # Extract the year, number and letters from those positions.
    year = string[y.span()[0]:y.span()[1]]
    number = string[n.span()[0]:n.span()[1]]
    letters = string[l.span()[0]:l.span()[1]]

    return {'year': year, 'number': number, 'letters': letters}


def text_splitter(text: str, max_tokens=2000):
    """
    Splits a text into chunks of sentences, each with at most max_tokens tokens.

    The function first counts the tokens in the input text. If the count exceeds max_tokens, it derives a per-chunk
    token budget that keeps the chunks roughly even, splits the text into sentences, and then iterates over the
    sentences, adding each one to the current chunk until the budget is reached. At that point the current chunk is
    appended to the list of chunks and a new chunk is started with the current sentence. If the total number of
    tokens is less than or equal to max_tokens, the whole text is returned as a single chunk.

    Parameters:
    text (str): The input text to be split into chunks.
    max_tokens (int): The maximum number of tokens allowed in each chunk.

    Returns:
    chunks (list of str): A list of text chunks, each with at most max_tokens tokens.
    """
    try:
        tokens_in_text = count_tokens(text)
    except Exception:
        # Fall back to a rough estimate of 3 characters per token if the tokenizer fails.
        tokens_in_text = len(text) / 3
    if tokens_in_text > max_tokens:

        # Divide the text's tokens over the (rounded-up) number of chunks needed,
        # so the chunks come out even and no budget exceeds max_tokens.
        max_tokens_per_chunk = int(tokens_in_text / math.ceil(tokens_in_text / max_tokens))

        # Split the text into sentences.
        sentences = nltk.sent_tokenize(text)

        # Initialize an empty list to hold the chunks and a string to hold the current chunk.
        chunks = []
        current_chunk = ''

        # Iterate over the sentences.
        for sentence in sentences:
            # If adding the next sentence doesn't exceed the per-chunk budget, add it to the current chunk.
            if count_tokens(current_chunk + ' ' + sentence) <= max_tokens_per_chunk:
                current_chunk += ' ' + sentence
            else:
                # Otherwise, add the current chunk to the chunks list and start a new chunk with this sentence.
                chunks.append(current_chunk)
                current_chunk = sentence

        # Add the last chunk to the chunks list.
        if current_chunk:
            chunks.append(current_chunk)

    else:
        chunks = [text]

    return chunks
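

# Usage sketch (not part of the original commit; 'speech_text' is a placeholder
# for any long string):
#     chunks = text_splitter(speech_text, max_tokens=1000)
# Each chunk stays within the per-chunk budget as long as no single sentence
# exceeds it, and sentence boundaries are preserved.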


parliamentary_term_now = 9  #* Update this every term.


model_mistral = "mistral-openorca"

eu_country_codes = {
    "Belgium": "BE",
    "Greece": "EL",
    "Lithuania": "LT",
    "Portugal": "PT",
    "Bulgaria": "BG",
    "Spain": "ES",
    "Luxembourg": "LU",
    "Romania": "RO",
    "Czechia": "CZ",
    "France": "FR",
    "Hungary": "HU",
    "Slovenia": "SI",
    "Denmark": "DK",
    "Croatia": "HR",
    "Malta": "MT",
    "Slovakia": "SK",
    "Germany": "DE",
    "Italy": "IT",
    "Netherlands": "NL",
    "Finland": "FI",
    "Estonia": "EE",
    "Cyprus": "CY",
    "Austria": "AT",
    "Sweden": "SE",
    "Ireland": "IE",
    "Latvia": "LV",
    "Poland": "PL",
}

country_flags = {
    "United Kingdom": "🇬🇧",
    "Sweden": "🇸🇪",
    "Spain": "🇪🇸",
    "Slovenia": "🇸🇮",
    "Slovakia": "🇸🇰",
    "Romania": "🇷🇴",
    "Portugal": "🇵🇹",
    "Poland": "🇵🇱",
    "Netherlands": "🇳🇱",
    "Malta": "🇲🇹",
    "Luxembourg": "🇱🇺",
    "Lithuania": "🇱🇹",
    "Latvia": "🇱🇻",
    "Italy": "🇮🇹",
    "Ireland": "🇮🇪",
    "Hungary": "🇭🇺",
    "Greece": "🇬🇷",
    "Germany": "🇩🇪",
    "France": "🇫🇷",
    "Finland": "🇫🇮",
    "Estonia": "🇪🇪",
    "Denmark": "🇩🇰",
    "Czechia": "🇨🇿",
    "Cyprus": "🇨🇾",
    "Croatia": "🇭🇷",
    "Bulgaria": "🇧🇬",
    "Belgium": "🇧🇪",
    "Austria": "🇦🇹",
}


policy_areas = [
    "Agriculture",
    "Business",
    "Industry",
    "Climate",
    "Culture",
    "Customs",
    "Development",
    "Education",
    "Employment",
    "Social Affairs",
    "Energy",
    "Environment",
    "FoodSafety",
    "SecurityPolicy",
    "Health",
    "Democracy",
    "Humanitarian Aid",
    "Justice",
    "Research And Innovation",
    "Market",
    "Taxation",
    "Trade",
    "Transport",
]

# From https://eur-lex.europa.eu/browse/summaries.html
# Overrides the rougher list above with the official EUR-Lex policy areas.
policy_areas = [
    'Agriculture', 'Audiovisual and media', 'Budget', 'Competition', 'Consumers',
    'Culture', 'Customs', 'Development', 'Digital single market',
    'Economic and monetary affairs', 'Education, training, youth, sport',
    'Employment and social policy', 'Energy', 'Enlargement', 'Enterprise',
    'Environment and climate change', 'External relations', 'External trade',
    'Food safety', 'Foreign and security policy', 'Fraud and corruption',
    'Humanitarian Aid and Civil Protection', 'Human rights', 'Institutional affairs',
    'Internal market', 'Justice, freedom and security', 'Oceans and fisheries',
    'Public health', 'Regional policy', 'Research and innovation', 'Taxation',
    'Transport',
]

countries = [
    "Romania",
    "Latvia",
    "Slovenia",
    "Denmark",
    "Spain",
    "Italy",
    "Hungary",
    "United Kingdom",
    "Netherlands",
    "Czechia",
    "Finland",
    "Belgium",
    "Germany",
    "France",
    "Slovakia",
    "Poland",
    "Ireland",
    "Malta",
    "Cyprus",
    "Luxembourg",
    "Greece",
    "Austria",
    "Sweden",
    "Portugal",
    "Lithuania",
    "Croatia",
    "Bulgaria",
    "Estonia",
]

parties = ["Renew", "S-D", "PPE", "Verts/ALE", "ECR", "NI", "The Left", "ID", "GUE/NGL"]

party_colors = {
    "EPP": "#3399FF",
    "S-D": "#F0001C",
    "Renew": "gold",
    "ID": "#0E408A",
    "G/EFA": "#57B45F",
    "ECR": "#196CA8",
    "GUE/NGL": "#B71C1C",
    "The Left": "#B71C1C",  # Same as GUE/NGL
    "NI": "white",
    "Vacant": "white",
    "PPE": "#3399FF",  # Same as EPP
    "NULL": "white",
    "Verts/ALE": "#57B45F",  # Same as G/EFA
    None: "white",
}


def insert_in_db(query):
    """Executes a write query against the SQLite database at path_db and commits it."""
    # path_db is not defined in this module; it is assumed to be provided by the importing code.
    con = sqlite3.connect(path_db)
    con.row_factory = sqlite3.Row
    cursor = con.cursor()
    cursor.execute(query)
    con.commit()
    con.close()
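

# Usage sketch (not part of the original commit; the table and column names are
# hypothetical):
#     insert_in_db("insert into speeches (speaker, text) values ('Name', '...')")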