From 6d1d97a84963e00c414e6b3c45a9d65f2b859a0a Mon Sep 17 00:00:00 2001 From: lasseedfast <> Date: Tue, 15 Oct 2024 15:20:28 +0200 Subject: [PATCH] Refactor code structure and remove unused config.json file --- _llm.py | 27 +- config.json | 3 - enrich_sci_articles.py | 139 +++++++ make_summaries.py | 85 +++++ parl_meetings.py | 828 +++++++++++++++++++++++++++++++++++++++++ test.py | 10 + 6 files changed, 1081 insertions(+), 11 deletions(-) delete mode 100644 config.json create mode 100644 enrich_sci_articles.py create mode 100644 make_summaries.py create mode 100644 parl_meetings.py create mode 100644 test.py diff --git a/_llm.py b/_llm.py index 68194f8..60db3f4 100644 --- a/_llm.py +++ b/_llm.py @@ -1,26 +1,37 @@ from ollama import Client import os import env_manager + env_manager.set_env() + class LLM: - def __init__(self, system_message=None, num_ctx=2000, temperature=0, chat=True) -> None: - self.llm_model = os.getenv("LLM_MODEL") + def __init__( + self, system_message=None, num_ctx=20000, temperature=0, chat=True + ) -> None: + self.llm_model = "mistral-nemo:12b-instruct-2407-q5_K_M" #os.getenv("LLM_MODEL") self.system_message = system_message self.options = {"temperature": temperature, "num_ctx": num_ctx} - self.messages = [{'role': 'system', 'content': self.system_message}] + self.messages = [{"role": "system", "content": self.system_message}] self.chat = chat - self.ollama = Client(host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}') + self.ollama = Client( + host=f'{os.getenv("LLM_URL")}:{os.getenv("LLM_PORT")}', + ) def generate(self, prompt: str) -> str: self.messages.append({"role": "user", "content": prompt}) - result = self.ollama.chat(model=self.llm_model, messages=self.messages, options=self.options, ) + result = self.ollama.chat( + model=self.llm_model, messages=self.messages, options=self.options + ) - answer = result['message']['content'] + answer = result["message"]["content"] self.messages.append({"role": "assistant", "content": answer}) if not self.chat: - self.messages = [{'role': 'system', 'content': self.system_message}] + self.messages = [{"role": "system", "content": self.system_message}] return answer - + +if __name__ == "__main__": + llm = LLM() + print(llm.generate("Why is the sky red?")) \ No newline at end of file diff --git a/config.json b/config.json deleted file mode 100644 index 57fc282..0000000 --- a/config.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "apikey": "eee5c352c30f3586f1cc42d0a07ce440" -} \ No newline at end of file diff --git a/enrich_sci_articles.py b/enrich_sci_articles.py new file mode 100644 index 0000000..871a3c2 --- /dev/null +++ b/enrich_sci_articles.py @@ -0,0 +1,139 @@ +from _arango import ArangoDB +from _llm import LLM +import tiktoken +from colorprinter.print_color import * + +def make_summaries(): + + # Initialize the tokenizer + tokenizer = tiktoken.get_encoding("cl100k_base") + + def count_tokens(text): + tokens = tokenizer.encode(text) + return len(tokens) + + articles = [i for i in arango.db.aql.execute(''' + for doc in sci_articles + return doc + ''')] + + for article in articles: + num_tokens = count_tokens(article["text"]) + llm = LLM( + system_message="You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations.", + num_ctx=num_tokens+3000 if num_tokens < 67000 else 70000, + temperature=0, + ) + try: + prompt = f''' + Make a summary of the following text: + """ + {article["text"]} + """ + Write a detailed summary. Make sure to include information from all sections: introduction, methods, results, and conclusion. + Everything about electric vehicles, and things related to electric cars, is very important. + Write the summary as if you are writing for someone who is not familiar with the topic. + Write it from the point of the view of the author of the text. + ''' + + article["summary"] = { + "meta": {"model": llm.llm_model, "system_message": llm.system_message, 'num_ctx': llm.options['num_ctx'], 'temperature': llm.options['temperature']}, + "text_sum": llm.generate(prompt), + } + print(article["summary"]) + arango.db.collection("sci_articles").update(article) + except Exception as e: + print(e) + article['summary_error'] = str(e) + arango.db.collection("sci_articles").update(article) + continue + + +def make_chunk_qa(num_qa=5): + articles = [i for i in arango.db.aql.execute(''' + for doc in sci_articles + return doc + ''')] + + for article in articles: + try: + if 'abstract' in article['metadata']: + abstract = article['metadata']['abstract'].replace('', '') + else: + abstract = article['summary']['text_sum'] + + question_machine = LLM( + system_message= f'''You are creating questions based on scientific articles. You will be given one text snippet from the article at a time and you should create {num_qa} questions based on that snippet. + To understand the article as a whole you can read this abstract: + """ + {abstract} + """ + The {num_qa} questions should be based on the text snippet and should be answerable by the text, but you can check the conversation history to make them more relevant for the context. + Don't write general questions like "what is the text about?", but rather questions that reflect the facts in the text. + The questions will be used in a CSV file so it's important that you answer on the format: "question1;question2;question3;question4;question5". + Always make {num_qa} questions to every text! + ''', + num_ctx=20000, + temperature=0.2, + ) + + answer_machine = LLM( + system_message=f'''You are answering questions about a text snippet from a scientific article. You will be given one question and one text snippet at a time and you should answer the questions based on that snippet. + The answers should be based on the text snippet, but you can check the conversation history to make them more relevant for the context. + Answer ONLY with the answer to the question, not a reasoning where you explain why you think that is the answer. + Make the answers long enough to be informative and contain relevant information, but not too long. + ''', + num_ctx=20000, + temperature=0.2, + ) + + for chunk in article["chunks"]: + if 'qa' in chunk: + continue + chunk["qa"] = [] + prompt = f''' + """ + {chunk['text']} + """ + Remember: + - If there is something in the text about electric cars, please include that in the question. + - Don't write general questinos like "what is the text about?" or "what is the main point of the text?", but rather questions that can be answered by the text. The questions will be used to query a vector database. + - Answer on the format: "question1;question2;question3;question4;question5" as the questions will be used in a CSV file. Answer ONLY with the questions, not anything else! + ''' + questions = question_machine.generate(prompt).split(';') + for question in questions: + print_blue(question) + if questions.index(question) == 0: + prompt = f''' + Answer the following question based on the text snippet below: {question} + """ + {chunk['text']} + """ + Remember: + - The answer should be based on the text. + - If there is something in the text about electric cars, please include that in the answer. + - Answer ONLY with the answer, nothing else. + ''' + else: + prompt = question + + answer = answer_machine.generate(question) + print_green(answer) + qa = { + "question": question, + "answer": answer, + } + chunk["qa"].append(qa) + + arango.db.collection("sci_articles").update(article, check_rev=False) + except Exception as e: + print(e) + article['qa_error'] = str(e) + arango.db.collection("sci_articles").update(article, check_rev=False) + continue + + + +if __name__ == "__main__": + arango = ArangoDB() + make_chunk_qa() \ No newline at end of file diff --git a/make_summaries.py b/make_summaries.py new file mode 100644 index 0000000..8274dad --- /dev/null +++ b/make_summaries.py @@ -0,0 +1,85 @@ +from _arango import ArangoDB +from _llm import LLM +import tiktoken + + +def make_summaries(): + + # Initialize the tokenizer + tokenizer = tiktoken.get_encoding("cl100k_base") + + def count_tokens(text): + tokens = tokenizer.encode(text) + return len(tokens) + + articles = [i for i in arango.db.aql.execute(''' + for doc in sci_articles + return doc + ''')] + + for article in articles: + num_tokens = count_tokens(article["text"]) + llm = LLM( + system_message="You are summarising scientific articles. It is very important that you keep to what is written and do not add any of your own opinions or interpretations.", + num_ctx=num_tokens+3000 if num_tokens < 67000 else 70000, + temperature=0, + ) + try: + prompt = f''' + Make a summary of the following text: + """ + {article["text"]} + """ + Write a detailed summary. Make sure to include information from all sections: introduction, methods, results, and conclusion. + Everything about electric vehicles, and things related to electric cars, is very important. + Write the summary as if you are writing for someone who is not familiar with the topic. + Write it from the point of the view of the author of the text. + ''' + + article["summary"] = { + "meta": {"model": llm.llm_model, "system_message": llm.system_message, 'num_ctx': llm.options['num_ctx'], 'temperature': llm.options['temperature']}, + "text_sum": llm.generate(prompt), + } + print(article["summary"]) + arango.db.collection("sci_articles").update(article) + except Exception as e: + print(e) + article['summary_error'] = str(e) + arango.db.collection("sci_articles").update(article) + continue + + +def make_chunk_qa(): + articles = [i for i in arango.db.aql.execute(''' + for doc in sci_articles + return doc + ''')] + + for article in articles: + questin_machine = LLM( + system_message="You are creating questions based on scientific articles. The questions should be based on the text and should be answerable by the text, but you can check the conversation history to make them more relevant for the context.", + num_ctx=20000, + temperature=0.2, + ) + + answer_machine = LLM( + system_message="You are answering questions based on scientific articles. The answers should be based on the text, but you can check the conversation history to make them more relevant for the context.", + num_ctx=20000, + temperature=0.2, + ) + + for chunk in article["chunks"]: + prompt = f''' + Create a question based on the following text: + """ + {chunk['text']} + """ + Write a question that can be answered by the text. Make sure to include information from all sections: introduction, methods, results, and conclusion. + Everything about electric vehicles, and things related to electric cars, is very important. + Write the question as if you are writing for someone who is not familiar with the topic. + Write it from the point of the view of the author of the text. + ''' + + +if __name__ == "__main__": + arango = ArangoDB() \ No newline at end of file diff --git a/parl_meetings.py b/parl_meetings.py new file mode 100644 index 0000000..8da001a --- /dev/null +++ b/parl_meetings.py @@ -0,0 +1,828 @@ +from pprint import pprint +import asyncio +from pyppeteer import launch +from bs4 import BeautifulSoup +from _arango import ArangoDB +from time import sleep +from colorprinter.print_color import * + +async def get_info(browser, id_number): + try: + page = await browser.newPage() + await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36') + + url = f'https://transparency-register.europa.eu/search-details_en?id={id_number}' + await page.goto(url, {'waitUntil': 'networkidle2'}) + content = await page.content() + await page.close() + + soup = BeautifulSoup(content, 'html.parser') + info_html = soup.find_all('div', class_='ecl')[0] + headers = info_html.find_all('h2') + tables = info_html.find_all('table') + headers_and_tables = zip(headers, tables) + + data = {} + for header, table in headers_and_tables: + header_text = header.text.strip() + data[header_text] = {} + rows = table.find_all('tr') + for row in rows: + cells = row.find_all('td') + if len(cells) == 2: + row_name = cells[0].get_text(strip=True) + cell_content = cells[1] + + # Check if the cell contains a list + ul = cell_content.find('ul') + if ul: + list_items = [li.get_text(strip=True) for li in ul.find_all('li')] + data[header_text][row_name] = {'list': list_items} + else: + cell_text = cell_content.get_text(strip=True) + links = cell_content.find_all('a') + cell_links = {link.get_text(strip=True): link['href'] for link in links} + if cell_links: + data[header_text][row_name] = {'text': cell_text, 'links': cell_links} + else: + data[header_text][row_name] = {'text': cell_text} + + data['html'] = str(info_html) + return data + except Exception as e: + print(f"Error fetching info for ID {id_number}: {e}") + return None + + +def update_info_from_html(): + arango = ArangoDB() + arango_docs = [i for i in arango.db.collection('eu_lobbyists').all()] + na = len(arango_docs) + n=0 + new_docs = [] + for doc in arango_docs: + n += 1 + html = doc['html'] + data = extract_from_html(html, {'html': html}) + data['_key'] = doc['_key'] + new_docs.append(data) + print(f'{n}/{na}', end='\r') + arango.db.collection('eu_lobbyists').insert_many(data, overwrite=True) + + +def extract_from_html(html, data = {}): + soup = BeautifulSoup(html, 'html.parser') + info_html = soup.find_all('div', class_='ecl')[0] + headers = info_html.find_all('h2') + tables = info_html.find_all('table', {'class': 'ecl-table ecl-table--zebra'}) + headers_and_tables = zip(headers, tables) + + for header, table in headers_and_tables: + header_text = header.text.strip() + if header_text not in data: + data[header_text] = {} + rows = table.find_all('tr') + table_data = {} + for row in rows: + cells = row.find_all('td') + if len(cells) == 2: + row_name = cells[0].get_text(strip=True) + cell_content = cells[1] + elif len(cells) == 1: + row_name = header_text + cell_content = cells[0] + else: + continue + + # Check if the cell contains a list + ul = cell_content.find('ul') + if ul: + list_items = [li.get_text(strip=True) for li in ul.find_all('li')] + table_data[row_name] = list_items + if header_text == row_name: + table_data = list_items + else: + cell_text = cell_content.get_text(strip=True) + links = cell_content.find_all('a') + cell_links = {link.get_text(strip=True): link['href'] for link in links} + if cell_links: + table_data[row_name] = {'text': cell_text, 'links': cell_links} + else: + table_data[row_name] = {'text': cell_text} + for k, v in table_data.items(): + data[header_text][k] = v + return data + + + + +async def get_all_lobbyists(browser, page_number): + try: + page = await browser.newPage() + await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36') + + url = f'https://transparency-register.europa.eu/searchregister-or-update/search-register_en?searchType=REGISTRANTS&page={page_number}#list-of-all-lobbyists' + await page.goto(url, {'waitUntil': 'networkidle2'}) + content = await page.content() + await page.close() + + soup = BeautifulSoup(content, 'html.parser') + links = soup.find_all('a', class_='ecl-link') + ids = [link['href'].split('=')[-1] for link in links if 'search-details' in link['href']] + return ids + except Exception as e: + print(f"Error fetching lobbyists for page {page_number}: {e}") + return [] + +async def main(): + arango = ArangoDB() + if not arango.db.has_collection('eu_lobbyists'): + arango.db.create_collection('eu_lobbyists') + #arango.db.collection('eu_lobbyists').truncate() + + browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox']) + + try: + for page_number in range(1, 148): + sleep(1.3) + ids = await get_all_lobbyists(browser, page_number) + tasks = [] + for id_number in ids: + if not arango.db.collection('eu_lobbyists').get(id_number): + tasks.append(get_info(browser, id_number)) + sleep(1.6) + + results = await asyncio.gather(*tasks) + for id_number, data in zip(ids, results): + if data: + data['_key'] = id_number + if 'Profile of registrant' in data: + arango.db.collection('eu_lobbyists').insert(data, overwrite=True) + print(f'Inserted {id_number}') + else: + print(f'"Profile of registrant" not in {id_number}') + finally: + await browser.close() + +if __name__ == '__main__': + + html = ''' +
+
+ + + +

Profile of registrant

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Organisation name: + + Associação Portuguesa para o Desenvolvimento Local +
+ REG Number: + + 500479542151-73 +
+ Status: + Activated
+ Registration date: + 07/04/2021 19:44:13
+ The registrant performed the last (partial or annual) update on: + 29/02/2024 13:18:08
+ Next annual update due latest on: + 28/02/2025
+

Applicant/registrant: organisation or self-employed individuals

+ + + + + + + + + + + + + + + + + + + + +
+ Organisation name: + + Associação Portuguesa para o Desenvolvimento Local +
+ Acronym: + + ANIMAR +
+ Form of entity: + + Associação Sem Fim Lucrativo +
+ Website: + + http://animar-dl.pt + +
+

Contact details

+ + + + + + + + + + + + + + + + + + + + + +
+ Contact details of your organisation's head office: +
+ Address: + + Av. Santos Dumont, 57 - 1º Esq. Avenidas Novas + + + 1050-202 + Lisboa + PORTUGAL + +
+ Telephone: + + (+351 ) + 21 952 74 50 +
+ Contact details of your organisation's office in charge of EU relations : + + Same as the head office +
+ + + + + + + + + + + + + +
+ Person with legal responsibility for the organisation: + + Mr + Marco + Domingues +
+ Position: + + President +
+

Person in charge of EU relations

+ + + + + + + + + + + + + + +
+ Person in charge of EU relations: + + Ms + Sara + Trindade +
+ Position: + + Direction Member +
+ + + + +

Goals/remit

+ + + + + + + + + + + +
+ Goals/remits of your organisation: + +
MISSÃO
+
+Valorizar, promover e reforçar o desenvolvimento local, a cidadania ativa, a igualdade e a coesão social na sociedade portuguesa, enquanto pilares de uma sociedade mais justa, equitativa, solidária e sustentável.
+
+VISÃO
+
+Ser reconhecida pela sociedade civil e pelo Estado, como a organização de referência promotora do desenvolvimento integrado, na diversidade de contextos, organizações e territórios.
+
+CULTURA
+
+Ser laica, apartidária, autónoma do Estado e promotora de interesses coletivos e representativos da sociedade civil; Ser uma organização de pontes para a convergência e concertação das organizações da sociedade civil, cidadãos e cidadãs, no reforço do interesse comum junto do Estado; Assumir a sua identidade na diversidade de organizações, indivíduos, territórios e contextos de atuação, e daí, destacar a multiplicidade de modelos de desenvolvimento local; Assumir a pluralidade de opiniões e modelos de atuação enquanto desafio inerente à promoção do desenvo (...)
+
+ Level of interest represented: + +
+
    +
  • + National +
  • +
+
+ +
+

Interests represented

+ + + + + + + + +
+ Applicant/registrant's representation: + + + Promotes their own interests or the collective interests of their members + + + +
+

Specific activities covered by the Register

+ + + + + + + + + + + + + + + + + + + + + + +
+ Main EU legislative proposals or policies targeted: + +
Desenvolvimento Local
+Governança, Cidadania e Igualdade
+Sustentabilidade, Coesão Social e Territorial
+Inovação e Empreendedorismo
+Empregabilidade
+Economia Social
+
+ Communication activities (events, campaigns, publications, etc.) related to the EU policies above: + + https://www.animar-dl.pt/ +https://www.animar-dl.pt/recursos/ +https://www.animar-dl.pt/projetos/ +
+ Intergroups and unofficial groupings (European Parliament): + +
+
    +
  • + + + Unofficial groupings: + _Social Economy Europe (a partir da CASES - Cooperativa António Sérgio para a Economia Social) + +
  • +
+
+ +
+ Participation in other EU supported forums and platforms: + +
_Comissão de Acompanhamento do PDR2020
+_ERASMUS+
+_Confederação Portuguesa de Economia Social
+
+ + + + + + + + + + + + + + + + + + + + + + + + + +
+ List of meetings with European Commission: +
+ + +
+
+ N/A +
+ List of contributions to public consultations: +
+ + +
+
+ + +
+ List of contributions to roadmaps: +
+ + +
+
+ + N/A +
+ Commission expert groups and other similar entities (European commission): +
+ + +
+
+ + + +
+ N/A +
+
+ List of meetings with the European Parliament: + Meeting declarations
+ +

Number of persons involved in the activities

+ + + + + + + + + + + + + + + + + + + +
+ Number of persons involved from your organisation expressed in % of working time: + + 100%: + 0, + 75%: + 0, + 50%: + 0, + 25%: + 0, + 10%: + 2 +
+ Number of persons involved (total): + + 2 +
+ Full time equivalent (FTE): + + 0.2 +
+ Complementary information: + +
N/A
+
+

Persons accredited for access to European Parliament premises

+ + + + + + + +
+ No accredited persons +
+

Fields of interest

+ + + + + + + + + + +
+ Fields of interest: + +
    +
  • + Agriculture and rural development +
  • +
  • + Climate action +
  • +
  • + Culture +
  • +
  • + Culture and media +
  • +
  • + Digital economy and society +
  • +
  • + Education and training +
  • +
  • + Employment and social affairs +
  • +
  • + Environment +
  • +
  • + International co-operation and development +
  • +
  • + Migration and asylum +
  • +
  • + Regional policy +
  • +
  • + Research and innovation +
  • +
  • + Youth +
  • +
+ +
+

Membership and affiliation

+ + + + + + + + + + + +
+ List of membership in associations, (con)federations, networks and other bodies : + +
https://www.animar-dl.pt/quem-somos/filiacoes-e-parcerias/
+
+ List of members and affiliate/partner organisations : + +
https://www.animar-dl.pt/entidades/
+
+

Category of registration

+ + + + + + + +
+ Category of registration: + + Non-governmental organisations, platforms and networks and similar +
+ +

Financial data

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ Closed financial year: + + 01/2022 - 12/2022 +
+ EU grants for the most recent closed financial year: + + N/A +
+ EU grants for the current financial year: + + N/A +
+ Intermediaries in the most recent closed financial year: + + N/A +
+ Intermediaries in the current financial year: + + N/A +
+ Estimate of annual costs related to activities covered by the register: + + + + + + + + €10,000 + - + €24,999 + +
+ Complementary information: + +
N/A
+
+

Code of conduct

+ + + + + + + + + + + +
+ Code of conduct: + + By its registration the organisation has signed the Transparency Register Code of Conduct +
+ If the applicant/registrant is also bound by another(professional) code of conduct it can be indicated in this space: + + N/A +
+
+ +
+ ''' + + #data = extract_from_html(html) + + #pprint(data['Fields of interest']) + update_info_from_html() + #asyncio.get_event_loop().run_until_complete(main()) \ No newline at end of file diff --git a/test.py b/test.py new file mode 100644 index 0000000..af95fba --- /dev/null +++ b/test.py @@ -0,0 +1,10 @@ +from _arango import ArangoDB +arango = ArangoDB() +initiatives = [i for i in arango.db.collection('eu_initiatives').all()] +ordered_by_headline = sorted(initiatives, key=lambda x: x['headline']) + +s = set() +for i in ordered_by_headline: + print(i['headline']) + s.add(i['headline']) +print(len(s)) \ No newline at end of file