From 6d1d97a84963e00c414e6b3c45a9d65f2b859a0a Mon Sep 17 00:00:00 2001
From: lasseedfast <>
Date: Tue, 15 Oct 2024 15:20:28 +0200
Subject: [PATCH] Refactor code structure and remove unused config.json file

---
 _llm.py                |  27 +-
 config.json            |   3 -
 enrich_sci_articles.py | 139 +++++++
 make_summaries.py      |  85 +++++
 parl_meetings.py       | 828 +++++++++++++++++++++++++++++++++++++++++
 test.py                |  10 +
 6 files changed, 1081 insertions(+), 11 deletions(-)
 delete mode 100644 config.json
 create mode 100644 enrich_sci_articles.py
 create mode 100644 make_summaries.py
 create mode 100644 parl_meetings.py
 create mode 100644 test.py

diff --git a/_llm.py b/_llm.py
index 68194f8..60db3f4 100644
--- a/_llm.py
+++ b/_llm.py
@@ -1,26 +1,37 @@
 from ollama import Client
 import os
 import env_manager
+
 env_manager.set_env()
 
+
 class LLM:
-    def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None:
-        self.llm_model = os.getenv("LLM_MODEL")
+    def __init__(self, system_message=None, num_ctx=20000, temperature=0, chat=True) -> None:
+        """Set up an Ollama chat session; with chat=False history resets after each reply."""
+        # Honour LLM_MODEL when it is set, otherwise fall back to the pinned default model.
+        self.llm_model = os.getenv("LLM_MODEL", "mistral-nemo:12b-instruct-2407-q5_K_M")
         self.system_message = system_message
         self.options = {"temperature": temperature, "num_ctx": num_ctx}
-        self.messages = [{'role': 'system', 'content': self.system_message}]
+        self.messages = [{"role": "system", "content": self.system_message}]
         self.chat = chat
-        self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}')
+        host = f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}'
+        self.ollama = Client(host=host)
 
     def generate(self, prompt: str) -> str:
+        """Append `prompt` to the conversation and return the model's reply text."""
         self.messages.append({"role": "user", "content": prompt})
 
-        result = self.ollama.chat(model=self.llm_model, messages=self.messages, options=self.options, )
+        result = self.ollama.chat(
+            model=self.llm_model, messages=self.messages, options=self.options
+        )
 
-        answer = result['message']['content']
+        answer = result["message"]["content"]
         self.messages.append({"role": "assistant", "content": answer})
         if not self.chat:
-            self.messages = [{'role': 'system', 'content': self.system_message}]
+            self.messages = [{"role": "system", "content": self.system_message}]
 
         return answer
-    
+
+if __name__ == "__main__":  # manual smoke test when the module is run directly
+    llm = LLM()  # default settings; the client host comes from LLM_URL / LLM_PORT
+    print(llm.generate("Why is the sky red?"))
\ No newline at end of file
diff --git a/config.json b/config.json
deleted file mode 100644
index 57fc282..0000000
--- a/config.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-    "apikey": "eee5c352c30f3586f1cc42d0a07ce440"
-}
\ No newline at end of file
diff --git a/enrich_sci_articles.py b/enrich_sci_articles.py
new file mode 100644
index 0000000..871a3c2
--- /dev/null
+++ b/enrich_sci_articles.py
@@ -0,0 +1,139 @@
+from _arango import ArangoDB
+from _llm import LLM
+import tiktoken
+from colorprinter.print_color import *
+
+def make_summaries():
+    """Summarise every `sci_articles` document and store it under "summary".
+
+    Uses the module-level `arango` handle that the `__main__` block creates.
+    """
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    def count_tokens(text):
+        return len(tokenizer.encode(text))
+
+    articles = list(arango.db.aql.execute('''
+    for doc in sci_articles
+    return doc
+    '''))
+
+    for article in articles:
+        try:
+            # Counted inside the try so a document without "text" is recorded, not fatal.
+            num_tokens = count_tokens(article["text"])
+            llm = LLM(
+                system_message="You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations.",
+                # Article size plus 3000 tokens of headroom, capped at 70000.
+                num_ctx=min(num_tokens + 3000, 70000),
+                temperature=0,
+            )
+            prompt = f'''
+    Make a summary of the following text:
+    """
+    {article["text"]}
+    """
+    Write a detailed summary. Make sure to include information from all sections: introduction, methods, results, and conclusion. 
+    Everything about electric vehicles, and things related to electric cars, is very important.
+    Write the summary as if you are writing for someone who is not familiar with the topic.
+    Write it from the point of the view of the author of the text.
+    '''
+            article["summary"] = {
+                "meta": {"model": llm.llm_model, "system_message": llm.system_message, 'num_ctx': llm.options['num_ctx'], 'temperature': llm.options['temperature']},
+                "text_sum": llm.generate(prompt),
+            }
+            print(article["summary"])
+            arango.db.collection("sci_articles").update(article)
+        except Exception as e:
+            print(e)
+            article['summary_error'] = str(e)
+            arango.db.collection("sci_articles").update(article)
+
+
+def make_chunk_qa(num_qa=5):
+    """Generate `num_qa` question/answer pairs for each not-yet-processed article chunk."""
+    # Uses the module-level `arango` handle that the `__main__` block creates.
+    # The reply template is derived from num_qa so it matches "make {num_qa} questions".
+    question_format = ";".join(f"question{i}" for i in range(1, num_qa + 1))
+    articles = list(arango.db.aql.execute('''
+    for doc in sci_articles
+    return doc
+    '''))
+
+    for article in articles:
+        try:
+            if 'abstract' in article['metadata']:
+                # Remove the complete JATS tags; '<jats:p' alone left a stray '>' behind.
+                abstract = article['metadata']['abstract'].replace('<jats:p>', '').replace('</jats:p>', '')
+            else:
+                abstract = article['summary']['text_sum']
+
+            question_machine = LLM(
+                system_message=f'''You are creating questions based on scientific articles. You will be given one text snippet from the article at a time and you should create {num_qa} questions based on that snippet.
+            To understand the article as a whole you can read this abstract:
+            """
+            {abstract}
+            """
+            The {num_qa} questions should be based on the text snippet and should be answerable by the text, but you can check the conversation history to make them more relevant for the context.
+            Don't write general questions like "what is the text about?", but rather questions that reflect the facts in the text.
+            The questions will be used in a CSV file so it's important that you answer on the format: "{question_format}".
+            Always make {num_qa} questions to every text!
+            ''',
+                num_ctx=20000,
+                temperature=0.2,
+            )
+
+            answer_machine = LLM(
+                system_message=f'''You are answering questions about a text snippet from a scientific article. You will be given one question and one text snippet at a time and you should answer the questions based on that snippet.
+            The answers should be based on the text snippet, but you can check the conversation history to make them more relevant for the context.
+            Answer ONLY with the answer to the question, not a reasoning where you explain why you think that is the answer.
+            Make the answers long enough to be informative and contain relevant information, but not too long. 
+            ''',
+                num_ctx=20000,
+                temperature=0.2,
+            )
+
+            for chunk in article["chunks"]:
+                if 'qa' in chunk:
+                    continue
+                chunk["qa"] = []
+                prompt = f'''
+                """
+                {chunk['text']}
+                """
+                Remember:
+                - If there is something in the text about electric cars, please include that in the question.
+                - Don't write general questinos like "what is the text about?" or "what is the main point of the text?", but rather questions that can be answered by the text. The questions will be used to query a vector database.
+                - Answer on the format: "{question_format}" as the questions will be used in a CSV file. Answer ONLY with the questions, not anything else!
+                '''
+                # Drop empty fragments (e.g. from a trailing ";") and surrounding whitespace.
+                questions = [q.strip() for q in question_machine.generate(prompt).split(';') if q.strip()]
+                for position, question in enumerate(questions):
+                    print_blue(question)
+                    if position == 0:
+                        prompt = f'''
+                        Answer the following question based on the text snippet below: {question}
+                        """
+                        {chunk['text']}
+                        """
+                        Remember:
+                        - The answer should be based on the text.  
+                        - If there is something in the text about electric cars, please include that in the answer.
+                        - Answer ONLY with the answer, nothing else. 
+                        '''
+                    else:
+                        # Later questions rely on the snippet already in the chat history.
+                        prompt = question
+                    answer = answer_machine.generate(prompt)
+                    print_green(answer)
+                    chunk["qa"].append({"question": question, "answer": answer})
+            arango.db.collection("sci_articles").update(article, check_rev=False)
+        except Exception as e:
+            print(e)
+            article['qa_error'] = str(e)
+            arango.db.collection("sci_articles").update(article, check_rev=False)
+
+
+
+if __name__ == "__main__":
+    arango = ArangoDB()  # module-level handle that the functions above read as a global
+    make_chunk_qa()  # only the Q&A step runs here; make_summaries() is not called
\ No newline at end of file
diff --git a/make_summaries.py b/make_summaries.py
new file mode 100644
index 0000000..8274dad
--- /dev/null
+++ b/make_summaries.py
@@ -0,0 +1,85 @@
+from _arango import ArangoDB
+from _llm import LLM
+import tiktoken
+
+
+def make_summaries():
+    """Summarise every `sci_articles` document and store it under "summary".
+
+    Uses the module-level `arango` handle that the `__main__` block creates.
+    """
+    tokenizer = tiktoken.get_encoding("cl100k_base")
+    def count_tokens(text):
+        return len(tokenizer.encode(text))
+
+    articles = list(arango.db.aql.execute('''
+    for doc in sci_articles
+    return doc
+    '''))
+
+    for article in articles:
+        try:
+            # Counted inside the try so a document without "text" is recorded, not fatal.
+            num_tokens = count_tokens(article["text"])
+            llm = LLM(
+                system_message="You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations.",
+                # Article size plus 3000 tokens of headroom, capped at 70000.
+                num_ctx=min(num_tokens + 3000, 70000),
+                temperature=0,
+            )
+            prompt = f'''
+    Make a summary of the following text:
+    """
+    {article["text"]}
+    """
+    Write a detailed summary. Make sure to include information from all sections: introduction, methods, results, and conclusion. 
+    Everything about electric vehicles, and things related to electric cars, is very important.
+    Write the summary as if you are writing for someone who is not familiar with the topic.
+    Write it from the point of the view of the author of the text.
+    '''
+            article["summary"] = {
+                "meta": {"model": llm.llm_model, "system_message": llm.system_message, 'num_ctx': llm.options['num_ctx'], 'temperature': llm.options['temperature']},
+                "text_sum": llm.generate(prompt),
+            }
+            print(article["summary"])
+            arango.db.collection("sci_articles").update(article)
+        except Exception as e:
+            print(e)
+            article['summary_error'] = str(e)
+            arango.db.collection("sci_articles").update(article)
+
+
+def make_chunk_qa():
+    """Unfinished draft: builds a question prompt per chunk but never sends or stores it."""
+    articles = [i for i in arango.db.aql.execute('''
+    for doc in sci_articles
+    return doc
+    ''')]
+    for article in articles:
+        questin_machine = LLM(  # constructed but never called in this function
+        system_message="You are creating questions based on scientific articles. The questions should be based on the text and should be answerable by the text, but you can check the conversation history to make them more relevant for the context.",
+        num_ctx=20000,
+        temperature=0.2,
+    )
+        
+        answer_machine = LLM(  # constructed but never called in this function
+        system_message="You are answering questions based on scientific articles. The answers should be based on the text, but you can check the conversation history to make them more relevant for the context.",
+        num_ctx=20000,
+        temperature=0.2,
+    )
+        # The prompt below is assigned and then discarded; no LLM call or DB write follows.
+        for chunk in article["chunks"]:
+            prompt = f'''
+            Create a question based on the following text:
+            """
+            {chunk['text']}
+            """
+            Write a question that can be answered by the text. Make sure to include information from all sections: introduction, methods, results, and conclusion. 
+            Everything about electric vehicles, and things related to electric cars, is very important.
+            Write the question as if you are writing for someone who is not familiar with the topic.
+            Write it from the point of the view of the author of the text.
+            '''
+
+
+if __name__ == "__main__":
+    arango = ArangoDB()  # only creates the handle; neither function above is invoked here
\ No newline at end of file
diff --git a/parl_meetings.py b/parl_meetings.py
new file mode 100644
index 0000000..8da001a
--- /dev/null
+++ b/parl_meetings.py
@@ -0,0 +1,828 @@
+from pprint import pprint
+import asyncio
+from pyppeteer import launch
+from bs4 import BeautifulSoup
+from _arango import ArangoDB
+from time import sleep
+from colorprinter.print_color import *
+
+async def get_info(browser, id_number):
+    """Fetch one registrant's detail page and parse its tables; returns None on any error."""
+    try:
+        page = await browser.newPage()
+        try:
+            await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
+            url = f'https://transparency-register.europa.eu/search-details_en?id={id_number}'
+            await page.goto(url, {'waitUntil': 'networkidle2'})
+            content = await page.content()
+        finally:
+            # Release the tab even when navigation fails, so failed lookups don't leak pages.
+            await page.close()
+        soup = BeautifulSoup(content, 'html.parser')
+        info_html = soup.find_all('div', class_='ecl')[0]
+        headers = info_html.find_all('h2')
+        tables = info_html.find_all('table')
+        data = {}
+        # Sections are paired by position: the i-th <h2> with the i-th <table>.
+        for header, table in zip(headers, tables):
+            header_text = header.text.strip()
+            data[header_text] = {}
+            for row in table.find_all('tr'):
+                cells = row.find_all('td')
+                if len(cells) != 2:
+                    continue
+                row_name = cells[0].get_text(strip=True)
+                cell_content = cells[1]
+                ul = cell_content.find('ul')
+                if ul:
+                    list_items = [li.get_text(strip=True) for li in ul.find_all('li')]
+                    data[header_text][row_name] = {'list': list_items}
+                else:
+                    cell_text = cell_content.get_text(strip=True)
+                    # href=True skips anchors without a target instead of raising KeyError.
+                    links = cell_content.find_all('a', href=True)
+                    cell_links = {link.get_text(strip=True): link['href'] for link in links}
+                    if cell_links:
+                        data[header_text][row_name] = {'text': cell_text, 'links': cell_links}
+                    else:
+                        data[header_text][row_name] = {'text': cell_text}
+
+        data['html'] = str(info_html)
+        return data
+    except Exception as e:
+        print(f"Error fetching info for ID {id_number}: {e}")
+        return None
+
+
+def update_info_from_html():
+    """Re-parse the stored HTML of every 'eu_lobbyists' document and write the results back."""
+    arango = ArangoDB()
+    collection = arango.db.collection('eu_lobbyists')
+    arango_docs = list(collection.all())
+    na = len(arango_docs)
+    new_docs = []
+    for n, doc in enumerate(arango_docs, start=1):
+        html = doc['html']
+        data = extract_from_html(html, {'html': html})
+        data['_key'] = doc['_key']
+        new_docs.append(data)
+        print(f'{n}/{na}', end='\r')
+    collection.insert_many(new_docs, overwrite=True)  # all parsed docs, not just the last one
+
+
+def extract_from_html(html, data=None):
+    """Parse the <h2>/<table> sections of a registrant page into `data` and return it."""
+    # Fresh dict per call: a shared `{}` default would leak fields between documents.
+    data = {} if data is None else data
+    soup = BeautifulSoup(html, 'html.parser')
+    info_html = soup.find_all('div', class_='ecl')[0]
+    headers = info_html.find_all('h2')
+    tables = info_html.find_all('table', {'class': 'ecl-table ecl-table--zebra'})
+    # Sections are paired by position: the i-th <h2> with the i-th matching <table>.
+    for header, table in zip(headers, tables):
+        header_text = header.text.strip()
+        if header_text not in data:
+            data[header_text] = {}
+        table_data = {}
+        for row in table.find_all('tr'):
+            cells = row.find_all('td')
+            if len(cells) == 2:
+                row_name = cells[0].get_text(strip=True)
+                cell_content = cells[1]
+            elif len(cells) == 1:
+                row_name = header_text
+                cell_content = cells[0]
+            else:
+                continue
+            ul = cell_content.find('ul')
+            if ul:
+                table_data[row_name] = [li.get_text(strip=True) for li in ul.find_all('li')]
+            else:
+                cell_text = cell_content.get_text(strip=True)
+                links = cell_content.find_all('a', href=True)
+                cell_links = {link.get_text(strip=True): link['href'] for link in links}
+                if cell_links:
+                    table_data[row_name] = {'text': cell_text, 'links': cell_links}
+                else:
+                    table_data[row_name] = {'text': cell_text}
+        section_list = table_data.get(header_text)
+        if isinstance(section_list, list) and len(table_data) == 1:
+            # The table is only a header-level list: store the bare list for the section.
+            data[header_text] = section_list
+        else:
+            data[header_text].update(table_data)
+    return data
+
+
+
+
+async def get_all_lobbyists(browser, page_number):
+    """Return the registrant IDs linked from one page of the register listing ([] on error)."""
+    try:
+        page = await browser.newPage()
+        try:
+            await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
+            url = f'https://transparency-register.europa.eu/searchregister-or-update/search-register_en?searchType=REGISTRANTS&page={page_number}#list-of-all-lobbyists'
+            await page.goto(url, {'waitUntil': 'networkidle2'})
+            content = await page.content()
+        finally:
+            await page.close()  # release the tab even when navigation fails
+        soup = BeautifulSoup(content, 'html.parser')
+        links = soup.find_all('a', class_='ecl-link', href=True)  # skip anchors without href
+        return [link['href'].split('=')[-1] for link in links if 'search-details' in link['href']]
+    except Exception as e:
+        print(f"Error fetching lobbyists for page {page_number}: {e}")
+        return []
+
+async def main():
+    """Crawl the register listing and store every registrant missing from 'eu_lobbyists'."""
+    arango = ArangoDB()
+    if not arango.db.has_collection('eu_lobbyists'):
+        arango.db.create_collection('eu_lobbyists')
+    collection = arango.db.collection('eu_lobbyists')
+    browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
+    try:
+        for page_number in range(1, 148):
+            await asyncio.sleep(1.3)  # pause between listing pages without blocking the loop
+            ids = await get_all_lobbyists(browser, page_number)
+            # Keep the fetched IDs in their own list so each result pairs with its own key.
+            pending_ids = [i for i in ids if not collection.get(i)]
+            tasks = []
+            for id_number in pending_ids:
+                tasks.append(asyncio.ensure_future(get_info(browser, id_number)))
+                await asyncio.sleep(1.6)  # the scheduled fetch starts now; next one waits
+            results = await asyncio.gather(*tasks)
+            for id_number, data in zip(pending_ids, results):
+                if not data:
+                    continue
+                data['_key'] = id_number
+                if 'Profile of registrant' in data:
+                    collection.insert(data, overwrite=True)
+                    print(f'Inserted {id_number}')
+                else:
+                    print(f'"Profile of registrant" not in {id_number}')
+    finally:
+        await browser.close()
+
+if __name__ == '__main__':
+
+    html = '''
+    <div class="ecl">
+<div>
+	
+	
+
+	<h2 id="profile-of-registrant">Profile of registrant</h2>
+	<table class="ecl-table ecl-table--zebra">
+
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Organisation name</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<strong>Associação Portuguesa para o Desenvolvimento Local</strong>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>REG Number</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>500479542151-73</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Status</strong>:
+				</td>
+				<td class="ecl-table__cell">Activated</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Registration date</strong>:
+				</td>
+				<td class="ecl-table__cell">07/04/2021 19:44:13</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>The registrant performed the last (partial or annual) update on</strong>:
+				</td>
+				<td class="ecl-table__cell">29/02/2024 13:18:08</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Next annual update due latest on</strong>:
+				</td>
+				<td class="ecl-table__cell">28/02/2025</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="applicantregistrant-organisation-or-self-employed-individuals"> Applicant/registrant: organisation or self-employed individuals</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Organisation name</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Associação Portuguesa para o Desenvolvimento Local</span>
+				</td>
+			</tr>
+			
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Acronym</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>ANIMAR</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Form of entity</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Associação Sem Fim Lucrativo</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Website</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<a href="http://animar-dl.pt" target="_blank">http://animar-dl.pt</a>
+					
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="contact-details">Contact details</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td colspan="2" class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Contact details of your organisation's head office</strong>:
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Address</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Av. Santos Dumont, 57 - 1º Esq. Avenidas Novas</span>
+					<span></span>
+					<span></span>
+					<span>1050-202</span>
+					<span>Lisboa</span>
+					<span>PORTUGAL</span>
+					<span></span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Telephone</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					(+<span>351</span> )
+					<span>21 952 74 50</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Contact details of your organisation's office in charge of EU relations  </strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Same as the head office</span>
+				</td>
+			</tr>
+			
+			
+			
+		</tbody>
+	</table>
+	<h2 id="person-with-legal-responsibility">Person with legal responsibility</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Person with legal responsibility for the organisation</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Mr</span>
+					<span>Marco</span>
+					<span>Domingues</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Position</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>President</span>
+				</td>
+			</tr>
+			
+		</tbody>
+	</table>
+	<h2 id="person-in-charge-of-eu-relations">Person in charge of EU relations</h2>
+	<table class="ecl-table ecl-table--zebra">
+
+		<tbody class="ecl-table__body">
+			
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Person in charge of EU relations</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Ms</span>
+					<span>Sara</span>
+					<span>Trindade</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Position</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Direction Member</span>
+				</td>
+			</tr>
+			
+		</tbody>
+	</table>
+	
+	
+	
+	
+	<h2 id="goalsremit">Goals/remit</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Goals/remits of your organisation</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<pre style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">MISSÃO
+
+Valorizar, promover e reforçar o desenvolvimento local, a cidadania ativa, a igualdade e a coesão social na sociedade portuguesa, enquanto pilares de uma sociedade mais justa, equitativa, solidária e sustentável.
+
+VISÃO
+
+Ser reconhecida pela sociedade civil e pelo Estado, como a organização de referência promotora do desenvolvimento integrado, na diversidade de contextos, organizações e territórios.
+
+CULTURA
+
+Ser laica, apartidária, autónoma do Estado e promotora de interesses coletivos e representativos da sociedade civil; Ser uma organização de pontes para a convergência e concertação das organizações da sociedade civil, cidadãos e cidadãs, no reforço do interesse comum junto do Estado; Assumir a sua identidade na diversidade de organizações, indivíduos, territórios e contextos de atuação, e daí, destacar a multiplicidade de modelos de desenvolvimento local; Assumir a pluralidade de opiniões e modelos de atuação enquanto desafio inerente à promoção do desenvo (...)</pre>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Level of interest represented</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<div>
+						<ul>
+							<li>
+								<span>National</span>
+							</li>
+						</ul>
+					</div>
+					
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="interests-represented">Interests represented</h2>
+	<table class="ecl-table ecl-table--zebra">
+
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Applicant/registrant's representation</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>
+						<span>Promotes their own interests or the collective interests of their members</span>
+					</span>
+					
+					
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="specific-activities-covered-by-the-register">Specific activities covered by the Register</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Main EU legislative proposals or policies targeted</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<pre style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">Desenvolvimento Local
+Governança, Cidadania e Igualdade
+Sustentabilidade, Coesão Social e Territorial
+Inovação e Empreendedorismo
+Empregabilidade
+Economia Social</pre>
+				</td>
+
+			</tr>
+
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Communication activities (events, campaigns, publications, etc.) related to the EU policies above</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">https://www.animar-dl.pt/
+https://www.animar-dl.pt/recursos/
+https://www.animar-dl.pt/projetos/</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Intergroups and unofficial groupings (European Parliament)</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<div>
+						<ul>
+							<li>
+								
+								<span>
+									<span>Unofficial groupings</span>:
+									<span>_Social Economy Europe (a partir da CASES - Cooperativa António Sérgio para a Economia Social)</span>
+								</span>
+							</li>
+						</ul>
+					</div>
+					
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Participation in other EU supported forums and platforms</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<pre style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">_Comissão de Acompanhamento do PDR2020
+_ERASMUS+
+_Confederação Portuguesa de Economia Social</pre>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<table class="ecl-table ecl-table--zebra ecl-u-mt-2xl ecl-u-border-top">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell">
+					<strong>List of meetings with European Commission: </strong>
+					<div class="ecl-popover" data-ecl-auto-init="Popover" data-ecl-auto-initialized="true">
+						<a href="javascript:void(0)" class="ecl-link ecl-link--standalone ecl-link--icon ecl-link--icon-before ecl-popover__toggle" aria-controls="popover-meetings" data-ecl-popover-toggle="" aria-expanded="false">
+							<svg class="ecl-icon ecl-icon--fluid ecl-link__icon" focusable="false" aria-hidden="true"><use xlink:href="/themes/contrib/oe_theme/dist/eu/images/icons/sprites/icons.svg#information"></use></svg>
+						</a>
+						  <div id="popover-meetings" class="ecl-popover__container" hidden="" style="width: 25em">
+							<div class="ecl-popover__content">This field displays the list of any meetings the registrant has held with Commissioners, Members of their Cabinet or Director-Generals since 01/12/2014 under its current identification number in the Register.</div>
+						  </div>
+					</div>
+				</td>
+				
+				<td>
+					<span>N/A</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Open Public Consultations">
+					<strong>List of contributions to public consultations</strong>:
+					<div class="ecl-popover" data-ecl-auto-init="Popover" data-ecl-auto-initialized="true">
+						<a href="javascript:void(0)" class="ecl-link ecl-link--standalone ecl-link--icon ecl-link--icon-before ecl-popover__toggle" aria-controls="popover-publicConsultation" data-ecl-popover-toggle="" aria-expanded="false">
+							<svg class="ecl-icon ecl-icon--fluid ecl-link__icon" focusable="false" aria-hidden="true"><use xlink:href="/themes/contrib/oe_theme/dist/eu/images/icons/sprites/icons.svg#information"></use></svg>
+						</a>
+						<div id="popover-publicConsultation" class="ecl-popover__container" hidden="" style="width: 25em">
+							<div class="ecl-popover__content">This field displays list of public consultations to which the entity contributed since 24/07/2018 under its current identification number in the Register (provided that the entity indicated the TR ID/REG number in its contribution).</div>
+						</div>
+					</div>
+				</td>
+				<td class="ecl-table__cell">
+					<ul>
+						<li>
+							<a href="https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives/12722-demographic-change-in-europe---green-paper-on-ageing/public-consultation" target="_blank" class="ecl-link ecl-link--standalone ecl-link--icon ecl-link--icon-after">
+								<span>Demographic change in Europe - green paper on ageing</span></a>
+						</li>
+					</ul>
+					
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Roadmaps">
+					<strong>List of contributions to roadmaps</strong>:
+					<div class="ecl-popover" data-ecl-auto-init="Popover" data-ecl-auto-initialized="true">
+						<a href="javascript:void(0)" class="ecl-link ecl-link--standalone ecl-link--icon ecl-link--icon-before ecl-popover__toggle" aria-controls="popover-roadmaps" data-ecl-popover-toggle="" aria-expanded="false">
+							<svg class="ecl-icon ecl-icon--fluid ecl-link__icon" focusable="false" aria-hidden="true"><use xlink:href="/themes/contrib/oe_theme/dist/eu/images/icons/sprites/icons.svg#information"></use></svg>
+						</a>
+						<div id="popover-roadmaps" class="ecl-popover__container" hidden="" style="width: 25em">
+							<div class="ecl-popover__content">This field displays list of roadmaps to which the entity contributed since 24/07/2018 under its current identification number in the Register (provided that the entity indicated the TR ID number in its contribution)</div>
+						</div>
+					</div>
+				</td>
+				<td class="ecl-table__cell">
+					
+					<span>N/A</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Commission expert groups and other similar entities (European commission)</strong>:
+					<div class="ecl-popover" data-ecl-auto-init="Popover" data-ecl-auto-initialized="true">
+						<a href="javascript:void(0)" class="ecl-link ecl-link--standalone ecl-link--icon ecl-link--icon-before ecl-popover__toggle" aria-controls="popover-experts" data-ecl-popover-toggle="" aria-expanded="false">
+							<svg class="ecl-icon ecl-icon--fluid ecl-link__icon" focusable="false" aria-hidden="true"><use xlink:href="/themes/contrib/oe_theme/dist/eu/images/icons/sprites/icons.svg#information"></use></svg>
+						</a>
+						  <div id="popover-experts" class="ecl-popover__container" hidden="" style="width: 25em">
+							<div class="ecl-popover__content">This field displays membership of any active Expert groups and is limited to Type C (Organisation) and Type B (Individual expert appointed as a representative of a common interest) members.</div>
+						  </div>
+					</div>
+				</td>
+				<td class="ecl-table__cell">
+
+
+					
+					<div>
+						<span>N/A</span>
+					</div>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell">
+					<strong>List of meetings with the European Parliament</strong>:
+				</td>
+				<td><a target="_blank" href="https://www.europarl.europa.eu/meps/en/search-meetings?transparencyRegisterIds=500479542151-73"><span>Meeting declarations</span></a></td>
+
+			</tr>
+		</tbody>
+	</table>
+
+	<h2 id="number-of-persons-involved-in-the-activities">Number of persons involved in the activities</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Number of persons involved from your organisation expressed in % of working time</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<b>100%</b>:
+					<span>0</span>,
+					<b>75%</b>:
+					<span>0</span>,
+					<b>50%</b>:
+					<span>0</span>,
+					<b>25%</b>:
+					<span>0</span>,
+					<b>10%</b>:
+					<span>2</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Number of persons involved (total)</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>2</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Full time equivalent (FTE)</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>0.2</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Complementary information</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<pre style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">N/A</pre>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="persons-accredited-for-access-to-european-parliament-premises">Persons accredited for access to European Parliament premises</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<span class="ecl-col-m-12 ecl-u-mt-m ecl-u-type-align-center">No accredited persons</span>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="fields-of-interest">Fields of interest</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Fields of interest</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<ul>
+						<li>
+							<span>Agriculture and rural development</span>
+						</li>
+						<li>
+							<span>Climate action</span>
+						</li>
+						<li>
+							<span>Culture</span>
+						</li>
+						<li>
+							<span>Culture and media</span>
+						</li>
+						<li>
+							<span>Digital economy and society</span>
+						</li>
+						<li>
+							<span>Education and training</span>
+						</li>
+						<li>
+							<span>Employment and social affairs</span>
+						</li>
+						<li>
+							<span>Environment</span>
+						</li>
+						<li>
+							<span>International co-operation and development</span>
+						</li>
+						<li>
+							<span>Migration and asylum</span>
+						</li>
+						<li>
+							<span>Regional policy</span>
+						</li>
+						<li>
+							<span>Research and innovation</span>
+						</li>
+						<li>
+							<span>Youth</span>
+						</li>
+					</ul>
+					
+				</td>
+			</tr>
+			
+			
+			
+		</tbody>
+	</table>
+	<h2 id="membership-and-affiliation">Membership and affiliation</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>List of membership in associations, (con)federations, networks and other bodies </strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<pre style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">https://www.animar-dl.pt/quem-somos/filiacoes-e-parcerias/</pre>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>List of members and affiliate/partner organisations </strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<pre style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">https://www.animar-dl.pt/entidades/</pre>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="category-of-registration">Category of registration</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Category of registration</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>Non-governmental organisations, platforms and networks and similar</span>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+
+	<h2 id="financial-data">Financial data</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Closed financial year</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>01/2022 - 12/2022</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>EU grants for the most recent closed financial year</strong>:
+				</td>
+				
+
+				<td class="ecl-table__cell">
+					<span>N/A</span>
+				</td>
+			</tr>
+
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>EU grants for the current financial year</strong>:
+				</td>
+				
+
+				<td class="ecl-table__cell">
+					<span>N/A</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Intermediaries in the most recent closed financial year</strong>:
+				</td>
+				
+				<td class="ecl-table__cell">
+					<span>N/A</span>
+				</td>
+
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell">
+					<strong>Intermediaries in the current financial year</strong>:
+				</td>
+				
+				<td class="ecl-table__cell">
+					<span>N/A</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell">
+					<strong>Estimate of annual costs related to activities covered by the register</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<!-- <10000 -->
+					
+					<!-- >=1000000 -->
+					
+					<!-- - -->
+					<span>
+						<span>€10,000</span>
+								-
+						<span>€24,999</span>
+					</span>
+				</td>
+			</tr>
+			
+			
+			
+			
+			
+			
+			
+			
+			<tr>
+				<td class="ecl-table__cell">
+					<strong>Complementary information</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<pre style="word-wrap: break-word; white-space: pre-line; font-family: var(--eui-base-font-family);">N/A</pre>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+	<h2 id="code-of-conduct">Code of conduct</h2>
+	<table class="ecl-table ecl-table--zebra">
+		<tbody class="ecl-table__body">
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>Code of conduct</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>By its registration the organisation has signed the Transparency Register Code of Conduct</span>
+				</td>
+			</tr>
+			<tr class="ecl-table__row">
+				<td class="ecl-table__cell" data-ecl-table-header="Profile of registrant">
+					<strong>If the applicant/registrant is also bound by another(professional) code of conduct it can be indicated in this space</strong>:
+				</td>
+				<td class="ecl-table__cell">
+					<span>N/A</span>
+				</td>
+			</tr>
+		</tbody>
+	</table>
+</div>
+
+</div>
+    '''
+
+    #data = extract_from_html(html)
+    
+    #pprint(data['Fields of interest'])
+    update_info_from_html()
+    #asyncio.get_event_loop().run_until_complete(main())
\ No newline at end of file
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..af95fba
--- /dev/null
+++ b/test.py
@@ -0,0 +1,10 @@
+from _arango import ArangoDB
+# List every headline in 'eu_initiatives' in sorted order, then count the distinct ones.
+arango = ArangoDB()
+initiatives = list(arango.db.collection('eu_initiatives').all())
+ordered_by_headline = sorted(initiatives, key=lambda doc: doc['headline'])
+
+headlines = [doc['headline'] for doc in ordered_by_headline]
+for headline in headlines:
+    print(headline)
+print(len(set(headlines)))
\ No newline at end of file