"""Scrape the EU Transparency Register and store registrants in ArangoDB.

The script can either crawl the register with a headless browser (``main``)
or re-parse the HTML already stored in the ``eu_lobbyists`` collection
(``update_info_from_html``).
"""
import asyncio
from pprint import pprint
from time import sleep

from bs4 import BeautifulSoup
from pyppeteer import launch

from _arango import ArangoDB

# Kept last, as in the original import order, so whatever names the star
# import brings in shadow the same names as before.
from colorprinter.print_color import *

# User agent sent with every page request.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
)

# Name of the ArangoDB collection holding one document per registrant.
_COLLECTION = 'eu_lobbyists'


async def _fetch_rendered_html(browser, url):
    """Open *url* in a new browser page and return the rendered HTML.

    The page is closed in a ``finally`` clause so that a navigation error
    does not leave the tab open in the shared browser.
    """
    page = await browser.newPage()
    try:
        await page.setUserAgent(_USER_AGENT)
        await page.goto(url, {'waitUntil': 'networkidle2'})
        return await page.content()
    finally:
        await page.close()


def _cell_text_and_links(cell):
    """Return ``{'text': ...}`` for a table cell, plus ``'links'`` if any.

    Only anchors that actually carry an ``href`` attribute are considered,
    so a bare ``<a>`` does not raise ``KeyError``.
    """
    entry = {'text': cell.get_text(strip=True)}
    links = {
        anchor.get_text(strip=True): anchor['href']
        for anchor in cell.find_all('a', href=True)
    }
    if links:
        entry['links'] = links
    return entry


def _list_items(ul):
    """Return the stripped text of every ``<li>`` below *ul*."""
    return [li.get_text(strip=True) for li in ul.find_all('li')]


def _parse_detail_page(content):
    """Parse a rendered detail page into the dict returned by ``get_info``.

    Headers (``h2``) and tables inside the first ``div.ecl`` are paired in
    document order. Only two-cell rows are kept; list cells are stored as
    ``{'list': [...]}``, other cells as ``{'text': ..., 'links': {...}}``.
    The HTML of the ``div.ecl`` element is stored under ``'html'``.
    """
    soup = BeautifulSoup(content, 'html.parser')
    info_html = soup.find_all('div', class_='ecl')[0]
    headers = info_html.find_all('h2')
    tables = info_html.find_all('table')

    data = {}
    for header, table in zip(headers, tables):
        section = data[header.text.strip()] = {}
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) != 2:
                continue
            row_name = cells[0].get_text(strip=True)
            cell_content = cells[1]
            ul = cell_content.find('ul')
            if ul is not None:
                section[row_name] = {'list': _list_items(ul)}
            else:
                section[row_name] = _cell_text_and_links(cell_content)

    data['html'] = str(info_html)
    return data


async def get_info(browser, id_number):
    """Fetch and parse the detail page of one registrant.

    Args:
        browser: pyppeteer browser used to open the page.
        id_number: register id placed in the ``id`` query parameter.

    Returns:
        The parsed data dict (see ``_parse_detail_page``), or ``None`` if
        fetching or parsing failed; the error is printed, not raised.
    """
    url = f'https://transparency-register.europa.eu/search-details_en?id={id_number}'
    try:
        content = await _fetch_rendered_html(browser, url)
        return _parse_detail_page(content)
    except Exception as e:
        print(f"Error fetching info for ID {id_number}: {e}")
        return None


def update_info_from_html():
    """Re-parse the stored HTML of every document and write the result back.

    Each document in ``eu_lobbyists`` is rebuilt from its ``'html'`` field
    with ``extract_from_html`` and re-inserted under the same ``_key``.
    """
    arango = ArangoDB()
    collection = arango.db.collection(_COLLECTION)
    arango_docs = list(collection.all())
    na = len(arango_docs)

    new_docs = []
    for n, doc in enumerate(arango_docs, start=1):
        html = doc['html']
        data = extract_from_html(html, {'html': html})
        data['_key'] = doc['_key']
        new_docs.append(data)
        print(f'{n}/{na}', end='\r')

    # Insert the accumulated documents (the original passed only the last
    # parsed dict here) and skip the call when there is nothing to write.
    if new_docs:
        collection.insert_many(new_docs, overwrite=True)


def extract_from_html(html, data=None):
    """Extract the register sections from stored detail-page HTML.

    Args:
        html: HTML containing a ``div.ecl`` element with ``h2`` headers and
            ``ecl-table ecl-table--zebra`` tables.
        data: optional dict to fill; a fresh dict is created when omitted
            (a shared mutable default would leak state between calls).

    Returns:
        *data*, with one entry per header. A section is normally a dict of
        row name -> value, where list cells become plain lists and other
        cells become ``{'text': ..., 'links': {...}}``. A table consisting
        of a single one-cell row that holds a list is stored directly as
        that list.

    Raises:
        IndexError: if *html* contains no ``div`` with class ``ecl``.
    """
    if data is None:
        data = {}

    soup = BeautifulSoup(html, 'html.parser')
    info_html = soup.find_all('div', class_='ecl')[0]
    headers = info_html.find_all('h2')
    tables = info_html.find_all('table', {'class': 'ecl-table ecl-table--zebra'})

    for header, table in zip(headers, tables):
        header_text = header.text.strip()
        table_data = {}
        # Set when a one-cell row (named after the header) holds a list.
        header_list = None

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == 2:
                row_name = cells[0].get_text(strip=True)
                cell_content = cells[1]
            elif len(cells) == 1:
                # A one-cell row has no label; file it under the header.
                row_name = header_text
                cell_content = cells[0]
            else:
                continue

            ul = cell_content.find('ul')
            if ul is not None:
                items = _list_items(ul)
                table_data[row_name] = items
                if row_name == header_text:
                    header_list = items
            else:
                table_data[row_name] = _cell_text_and_links(cell_content)

        if header_list is not None and len(table_data) == 1:
            # The whole section is just a list: store it flat. The original
            # rebound table_data to the list and then failed on .items().
            data[header_text] = header_list
            continue

        section = data.get(header_text)
        if not isinstance(section, dict):
            section = data[header_text] = {}
        section.update(table_data)

    return data


async def get_all_lobbyists(browser, page_number):
    """Return the registrant ids linked from one page of the search listing.

    Args:
        browser: pyppeteer browser used to open the page.
        page_number: value of the ``page`` query parameter.

    Returns:
        A list of ids taken from the text after the last ``=`` of every
        ``a.ecl-link`` whose href contains ``search-details``; an empty
        list if the page could not be fetched (the error is printed).
    """
    url = f'https://transparency-register.europa.eu/searchregister-or-update/search-register_en?searchType=REGISTRANTS&page={page_number}#list-of-all-lobbyists'
    try:
        content = await _fetch_rendered_html(browser, url)
        soup = BeautifulSoup(content, 'html.parser')
        # href=True skips anchors without an href instead of failing the page.
        links = soup.find_all('a', class_='ecl-link', href=True)
        return [
            link['href'].split('=')[-1]
            for link in links
            if 'search-details' in link['href']
        ]
    except Exception as e:
        print(f"Error fetching lobbyists for page {page_number}: {e}")
        return []


async def main(first_page=1, last_page=147):
    """Crawl the listing pages and store every registrant not yet in the DB.

    Args:
        first_page: first listing page to visit (inclusive).
        last_page: last listing page to visit (inclusive).
    """
    arango = ArangoDB()
    if not arango.db.has_collection(_COLLECTION):
        arango.db.create_collection(_COLLECTION)
    #arango.db.collection('eu_lobbyists').truncate()
    collection = arango.db.collection(_COLLECTION)

    browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
    try:
        for page_number in range(first_page, last_page + 1):
            # asyncio.sleep instead of time.sleep: do not block the loop.
            await asyncio.sleep(1.3)
            ids = await get_all_lobbyists(browser, page_number)

            # Keep the ids that are actually fetched so results can be
            # paired with the right key (zipping against `ids` misaligns
            # them as soon as one id is skipped).
            pending_ids = []
            tasks = []
            for id_number in ids:
                if collection.get(id_number):
                    continue
                pending_ids.append(id_number)
                # Schedule now so the pause below really staggers requests;
                # a bare coroutine would not start until gather().
                tasks.append(asyncio.ensure_future(get_info(browser, id_number)))
                await asyncio.sleep(1.6)

            results = await asyncio.gather(*tasks)
            for id_number, data in zip(pending_ids, results):
                if not data:
                    continue
                data['_key'] = id_number
                if 'Profile of registrant' in data:
                    collection.insert(data, overwrite=True)
                    print(f'Inserted {id_number}')
                else:
                    print(f'"Profile of registrant" not in {id_number}')
    finally:
        await browser.close()


if __name__ == '__main__':
    # Sample detail-page content, used only by the debugging lines below.
    html = '''

Profile of registrant

Organisation name: Associação Portuguesa para o Desenvolvimento Local
REG Number: 500479542151-73
Status: Activated
Registration date: 07/04/2021 19:44:13
The registrant performed the last (partial or annual) update on: 29/02/2024 13:18:08
Next annual update due latest on: 28/02/2025

Applicant/registrant: organisation or self-employed individuals

Organisation name: Associação Portuguesa para o Desenvolvimento Local
Acronym: ANIMAR
Form of entity: Associação Sem Fim Lucrativo
Website: http://animar-dl.pt

Contact details

Contact details of your organisation's head office:
Address: Av. Santos Dumont, 57 - 1º Esq. Avenidas Novas 1050-202 Lisboa PORTUGAL
Telephone: (+351 ) 21 952 74 50
Contact details of your organisation's office in charge of EU relations : Same as the head office
Person with legal responsibility for the organisation: Mr Marco Domingues
Position: President

Person in charge of EU relations

Person in charge of EU relations: Ms Sara Trindade
Position: Direction Member

Goals/remit

Goals/remits of your organisation:
MISSÃO

Valorizar, promover e reforçar o desenvolvimento local, a cidadania ativa, a igualdade e a coesão social na sociedade portuguesa, enquanto pilares de uma sociedade mais justa, equitativa, solidária e sustentável.

VISÃO

Ser reconhecida pela sociedade civil e pelo Estado, como a organização de referência promotora do desenvolvimento integrado, na diversidade de contextos, organizações e territórios.

CULTURA

Ser laica, apartidária, autónoma do Estado e promotora de interesses coletivos e representativos da sociedade civil; Ser uma organização de pontes para a convergência e concertação das organizações da sociedade civil, cidadãos e cidadãs, no reforço do interesse comum junto do Estado; Assumir a sua identidade na diversidade de organizações, indivíduos, territórios e contextos de atuação, e daí, destacar a multiplicidade de modelos de desenvolvimento local; Assumir a pluralidade de opiniões e modelos de atuação enquanto desafio inerente à promoção do desenvo (...)
Level of interest represented:
  • National

Interests represented

Applicant/registrant's representation: Promotes their own interests or the collective interests of their members

Specific activities covered by the Register

Main EU legislative proposals or policies targeted:
Desenvolvimento Local
Governança, Cidadania e Igualdade
Sustentabilidade, Coesão Social e Territorial
Inovação e Empreendedorismo
Empregabilidade
Economia Social
Communication activities (events, campaigns, publications, etc.) related to the EU policies above: https://www.animar-dl.pt/ https://www.animar-dl.pt/recursos/ https://www.animar-dl.pt/projetos/
Intergroups and unofficial groupings (European Parliament):
  • Unofficial groupings: _Social Economy Europe (a partir da CASES - Cooperativa António Sérgio para a Economia Social)
Participation in other EU supported forums and platforms:
_Comissão de Acompanhamento do PDR2020
_ERASMUS+
_Confederação Portuguesa de Economia Social
List of meetings with European Commission:
N/A
List of contributions to public consultations:
List of contributions to roadmaps:
N/A
Commission expert groups and other similar entities (European commission):
N/A
List of meetings with the European Parliament: Meeting declarations

Number of persons involved in the activities

Number of persons involved from your organisation expressed in % of working time: 100%: 0, 75%: 0, 50%: 0, 25%: 0, 10%: 2
Number of persons involved (total): 2
Full time equivalent (FTE): 0.2
Complementary information:
N/A

Persons accredited for access to European Parliament premises

No accredited persons

Fields of interest

Fields of interest:
  • Agriculture and rural development
  • Climate action
  • Culture
  • Culture and media
  • Digital economy and society
  • Education and training
  • Employment and social affairs
  • Environment
  • International co-operation and development
  • Migration and asylum
  • Regional policy
  • Research and innovation
  • Youth

Membership and affiliation

List of membership in associations, (con)federations, networks and other bodies :
https://www.animar-dl.pt/quem-somos/filiacoes-e-parcerias/
List of members and affiliate/partner organisations :
https://www.animar-dl.pt/entidades/

Category of registration

Category of registration: Non-governmental organisations, platforms and networks and similar

Financial data

Closed financial year: 01/2022 - 12/2022
EU grants for the most recent closed financial year: N/A
EU grants for the current financial year: N/A
Intermediaries in the most recent closed financial year: N/A
Intermediaries in the current financial year: N/A
Estimate of annual costs related to activities covered by the register: €10,000 - €24,999
Complementary information:
N/A

Code of conduct

Code of conduct: By its registration the organisation has signed the Transparency Register Code of Conduct
If the applicant/registrant is also bound by another(professional) code of conduct it can be indicated in this space: N/A
'''
    # Debugging the parser against the sample above:
    #data = extract_from_html(html)
    #pprint(data['Fields of interest'])

    # NOTE(review): reconstructed as the active entry point from the
    # collapsed source line — confirm this is the intended mode.
    update_info_from_html()

    # Full crawl of the register:
    #asyncio.get_event_loop().run_until_complete(main())