You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

163 lines
6.0 KiB

import re
import json
import asyncio
import os
from pyppeteer import launch
from bs4 import BeautifulSoup
from _arango import ArangoDB
from datetime import datetime
import random
from colorprinter.print_color import *
start = datetime.now()  # script start timestamp (never read afterwards in this file; presumably for ad-hoc timing)
def sanitize_filename(filename):
    """Return *filename* with filesystem-unsafe characters replaced by '_'.

    The characters \\ / * ? : " < > | are not allowed in filenames on
    common filesystems; each occurrence is substituted with an underscore.
    """
    unsafe = '\\/*?:"<>|'
    return "".join("_" if ch in unsafe else ch for ch in filename)
async def get_info(browser, doc, download_path):
    """Visit an initiative's detail page, download its PDF and enrich *doc*.

    Opens ``doc['link']`` in a new tab, clicks the document download button,
    renames the downloaded PDF to ``<_key>.pdf`` inside *download_path*, and
    scrapes the consultation link, summary and feedback URL from the page.

    Args:
        browser: a pyppeteer Browser instance.
        doc: initiative dict; must contain 'link', '_key' and 'meta'.
        download_path: directory where Chromium saves downloads.

    Returns:
        The same *doc*, mutated in place with 'consultation_link', 'summary',
        optionally 'feedback_url' and 'meta.file_path_doc'.
    """
    page = await browser.newPage()
    # Allow downloads into download_path via the raw DevTools protocol;
    # pyppeteer exposes no public API for this.
    await page._client.send('Page.setDownloadBehavior', {
        'behavior': 'allow',
        'downloadPath': download_path
    })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
    await page.goto(doc['link'], {'waitUntil': 'networkidle2'})
    await asyncio.sleep(2)
    content = await page.content()
    # Locate and click the download button; the page may legitimately have none.
    try:
        await page.click('a.ecl-file__download')  # Adjust the selector as needed
        await asyncio.sleep(5)  # crude wait for the download to complete
    except Exception as e:
        print(f"Error clicking download button: {e}")
    await page.close()
    # Rename the freshly downloaded file to <_key>.pdf. Already-renamed files
    # contain '_' (fix_key output), so a plain browser-named PDF is the new one.
    for filename in os.listdir(download_path):
        if filename.endswith(".pdf") and '_' not in filename:
            old_file = os.path.join(download_path, filename)
            new_file = os.path.join(download_path, f"{doc['_key']}.pdf")
            try:
                # Overwrite any stale file from a previous run.
                if os.path.exists(new_file):
                    os.remove(new_file)
                os.rename(old_file, new_file)
            except OSError:
                # The key may make the path too long for the filesystem.
                # Retry with a truncated key, keeping directory and extension
                # intact (the old code truncated the FULL path, mangling both).
                fallback = os.path.join(download_path, f"{doc['_key'][:50]}.pdf")
                try:
                    if os.path.exists(fallback):
                        os.remove(fallback)
                    os.rename(old_file, fallback)
                    new_file = fallback
                except OSError:
                    os.remove(old_file)  # give up on this download entirely
            doc['meta']['file_path_doc'] = new_file
            break
    soup = BeautifulSoup(content, 'html.parser')
    consultation_button = soup.find('span', {'class': 'ecl-ecl-button__container'})
    if consultation_button:
        # doc['link'] is already an absolute https://ec.europa.eu URL (built in
        # get_all_initiatives), so do NOT prefix the domain a second time.
        doc['consultation_link'] = f"{doc['link']}/public-consultation_en"
    else:
        doc['consultation_link'] = None
    summary_div = soup.find('div', {'class': 'initiative-detail-summary'})
    doc['summary'] = summary_div.get_text(strip=True) if summary_div else ''
    # The feedback page is linked by its anchor text, not a stable class.
    for link in soup.find_all('a'):
        if 'All feedback and statistics' in link.get_text():
            doc['feedback_url'] = f"https://ec.europa.eu{link['href']}"
            break
    return doc
async def get_all_initiatives(browser, page_number: int, status):
    """Scrape one results page of the EU 'Have your say' initiative listing.

    Args:
        browser: a pyppeteer Browser instance.
        page_number: zero-based results page to fetch.
        status: feedback status filter (e.g. 'OPEN', 'CLOSED').

    Returns:
        A list of initiative dicts (headline, link, topic, type_of_act,
        feedback_period), or None when the page holds no results — the
        caller uses that as the "paged past the end" signal.
    """
    page = await browser.newPage()
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')
    url = f'https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives_en?feedbackStatus={status}&page={page_number}'
    await page.goto(url, {'waitUntil': 'networkidle2'})
    await asyncio.sleep(2)
    content = await page.content()
    await page.close()
    soup = BeautifulSoup(content, 'html.parser')
    # Guard the container lookup: the old [0] indexing raised IndexError when
    # the layout was missing (e.g. an error page); treat that as "no results".
    blocks = soup.find_all('div', {'class': 'ux-block-content'})
    if not blocks:
        return None
    articles = blocks[0].find_all('article', {'class': 'search-result-item'})
    if not articles:
        return None

    def _labelled_value(article, label):
        # Metadata fields are rendered as a <div>label</div><div>value</div> pair.
        label_div = article.find('div', string=label)
        return label_div.find_next_sibling('div').get_text(strip=True) if label_div else None

    initiatives = []
    for article in articles:
        try:
            link_tag = article.find('a', {'class': 'ecl-u-pt-xs ecl-u-type-none'})
            link = f"https://ec.europa.eu{link_tag['href']}"
            headline = link_tag.find('div', {'class': 'search-result-title'}).get_text(strip=True)
            initiatives.append({
                'headline': headline,
                'link': link,
                'topic': _labelled_value(article, 'Topic'),
                'type_of_act': _labelled_value(article, 'Type of act'),
                'feedback_period': _labelled_value(article, 'Feedback period')
            })
        except Exception as e:
            # Dump the offending markup and abort the whole run so the
            # selectors can be fixed (site layout change).
            print(f"Error fetching initiative: {e}")
            print(article.prettify())
            print(blocks[0].prettify())
            exit()
    return initiatives
async def main():
    """Crawl every initiative for each feedback status and store it in ArangoDB.

    Recreates (or truncates) the 'eu_initiatives' collection, then walks the
    listing pages per status, enriching each initiative via get_info before
    upserting it.
    """
    arango = ArangoDB()
    collection_name = 'eu_initiatives'
    if not arango.db.has_collection(collection_name):
        arango.db.create_collection(collection_name)
    collection = arango.db.collection(collection_name)
    collection.truncate()  # full re-crawl: drop previous contents
    browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
    for status in ['OPEN', 'CLOSED', 'UPCOMING', 'DISABLED']:
        print(status)
        for page_number in range(335):
            initiatives = await get_all_initiatives(browser, page_number, status)
            if not initiatives:
                break  # paged past the last results page for this status
            for initiative in initiatives:
                print(initiative['headline'])
                initiative['_key'] = arango.fix_key(initiative['headline'].split('_')[0])
                initiative['meta'] = {
                    'status': status,
                    'date': datetime.now().strftime('%Y-%m-%d'),
                    'page': page_number,
                }
                initiative = await get_info(browser, initiative, 'initiatives_downloads')
                collection.insert(initiative, overwrite=True)
            print(f'Page {page_number} done')
    await browser.close()
if __name__ == '__main__':
    # asyncio.run replaces the deprecated
    # asyncio.get_event_loop().run_until_complete pattern: it creates a fresh
    # event loop, runs main() to completion, and closes the loop cleanly.
    # (Dead commented-out deduplication diagnostics removed.)
    asyncio.run(main())