parent
e3d6933702
commit
4147b7185d
1 changed files with 163 additions and 0 deletions
@ -0,0 +1,163 @@ |
|||||||
|
import re |
||||||
|
import json |
||||||
|
import asyncio |
||||||
|
import os |
||||||
|
from pyppeteer import launch |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from _arango import ArangoDB |
||||||
|
from datetime import datetime |
||||||
|
import random |
||||||
|
from colorprinter.print_color import * |
||||||
|
|
||||||
|
# Wall-clock timestamp taken at import time, presumably to measure total run
# time — NOTE(review): `start` is never read anywhere in this file; confirm
# whether it is still needed or was left over from timing experiments.
start = datetime.now()
||||||
|
|
||||||
|
def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in common
    filesystems (\\ / * ? : " < > |) each replaced by an underscore."""
    illegal = '\\/*?:"<>|'
    return filename.translate(str.maketrans({ch: "_" for ch in illegal}))
||||||
|
|
||||||
|
async def get_info(browser, doc, download_path):
    """Open the initiative page for *doc*, download its attached PDF, and
    scrape the consultation link, summary and feedback URL into *doc*.

    Parameters
    ----------
    browser : pyppeteer Browser used to open a fresh tab.
    doc : dict with at least 'link' (absolute URL), '_key' and 'meta' keys;
        mutated in place.
    download_path : directory where Chromium drops the downloaded file.

    Returns
    -------
    dict
        The same *doc*, enriched with 'consultation_link', 'summary',
        optionally 'feedback_url' and 'meta.file_path_doc'.
    """
    page = await browser.newPage()
    # Tell Chromium where to put downloads. pyppeteer exposes no public API
    # for this, hence the private CDP call via page._client.
    await page._client.send('Page.setDownloadBehavior', {
        'behavior': 'allow',
        'downloadPath': download_path
    })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')

    await page.goto(doc['link'], {'waitUntil': 'networkidle2'})
    await asyncio.sleep(2)  # give late client-side rendering a chance to finish
    content = await page.content()

    # Locate and click the download button.
    try:
        await page.click('a.ecl-file__download')  # adjust the selector as needed
        await asyncio.sleep(5)  # wait for the download to complete
    except Exception as e:
        print(f"Error clicking download button: {e}")

    await page.close()

    # Rename the freshly downloaded file after the document key. Files whose
    # names already contain '_' are assumed to have been renamed on an
    # earlier pass and are skipped.
    for filename in os.listdir(download_path):
        if filename.endswith(".pdf") and '_' not in filename:  # adjust the file extension as needed
            old_file = os.path.join(download_path, filename)
            new_file = os.path.join(download_path, f"{doc['_key']}.pdf")
            try:
                # Replace any stale copy from a previous run.
                if os.path.exists(new_file):
                    os.remove(new_file)
                os.rename(old_file, new_file)
            # Was a bare `except:` (swallowed KeyboardInterrupt/SystemExit);
            # the filesystem calls above only raise OSError.
            except OSError:
                try:
                    # Fallback for over-long target paths: truncate. NOTE:
                    # this loses the '.pdf' suffix — kept for compatibility.
                    os.rename(old_file, new_file[:50])
                except OSError:
                    os.remove(old_file)
            doc['meta']['file_path_doc'] = new_file
            break

    soup = BeautifulSoup(content, 'html.parser')

    consultation_button = soup.find('span', {'class': 'ecl-ecl-button__container'})
    if consultation_button:
        # BUGFIX: doc['link'] is already an absolute URL (built with the
        # https://ec.europa.eu prefix in get_all_initiatives), so prefixing
        # the domain again produced a broken double-prefixed URL.
        doc['consultation_link'] = f"{doc['link']}/public-consultation_en"
    else:
        doc['consultation_link'] = None
    summary_div = soup.find('div', {'class': 'initiative-detail-summary'})
    if summary_div:
        doc['summary'] = summary_div.get_text(strip=True)
    else:
        doc['summary'] = ''

    # The feedback page is linked only by its anchor text.
    for link in soup.find_all('a'):
        if 'All feedback and statistics' in link.get_text():
            doc['feedback_url'] = f"https://ec.europa.eu{link['href']}"
            break

    return doc
||||||
|
|
||||||
|
async def get_all_initiatives(browser, page_number: int, status):
    """Scrape one results page of the EU 'Have your say' initiatives listing.

    Parameters
    ----------
    browser : pyppeteer Browser used to open a fresh tab.
    page_number : 0-based page index of the paginated listing.
    status : feedback-status filter, e.g. 'OPEN' or 'CLOSED'.

    Returns
    -------
    list[dict] | None
        One dict per initiative (headline, link, topic, type_of_act,
        feedback_period), or None when the page has no results — i.e. the
        caller has paged past the end.
    """
    page = await browser.newPage()
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')

    url = f'https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives_en?feedbackStatus={status}&page={page_number}'

    await page.goto(url, {'waitUntil': 'networkidle2'})
    await asyncio.sleep(2)  # let client-side rendering settle
    content = await page.content()
    await page.close()

    soup = BeautifulSoup(content, 'html.parser')

    # Guard against an unexpected page layout: the original indexed [0]
    # unconditionally, which raised IndexError on a missing content block.
    blocks = soup.find_all('div', {'class': 'ux-block-content'})
    if not blocks:
        return None
    html = blocks[0]

    articles = html.find_all('article', {'class': 'search-result-item'})
    if not articles:
        return None

    initiatives = []
    for article in articles:
        try:
            link_tag = article.find('a', {'class': 'ecl-u-pt-xs ecl-u-type-none'})
            link = f"https://ec.europa.eu{link_tag['href']}"
            headline = link_tag.find('div', {'class': 'search-result-title'}).get_text(strip=True)

            # Each metadata field sits in a label <div> followed by a value <div>.
            topic_div = article.find('div', string='Topic')
            topic = topic_div.find_next_sibling('div').get_text(strip=True) if topic_div else None

            type_of_act_div = article.find('div', string='Type of act')
            type_of_act = type_of_act_div.find_next_sibling('div').get_text(strip=True) if type_of_act_div else None

            feedback_period_div = article.find('div', string='Feedback period')
            feedback_period = feedback_period_div.find_next_sibling('div').get_text(strip=True) if feedback_period_div else None

            initiatives.append({
                'headline': headline,
                'link': link,
                'topic': topic,
                'type_of_act': type_of_act,
                'feedback_period': feedback_period,
            })
        except Exception as e:
            # A parse failure here means the site markup changed: dump the
            # offending article for debugging and abort the whole run.
            print(f"Error fetching initiative: {e}")
            print(article.prettify())
            print(html.prettify())
            # Was exit() — which depends on the `site` module and exits with
            # status 0 even on this error path; exit non-zero instead.
            raise SystemExit(1)
    return initiatives
||||||
|
|
||||||
|
async def main():
    """Rebuild the 'eu_initiatives' ArangoDB collection from scratch by
    scraping every listing page for each feedback status, enriching each
    initiative via get_info, and upserting it.

    Side effects: truncates the existing collection, downloads PDFs into
    'initiatives_downloads', and launches/closes a headless Chromium.
    """
    arango = ArangoDB()
    if not arango.db.has_collection('eu_initiatives'):
        arango.db.create_collection('eu_initiatives')
    # Full rebuild: drop all previously stored initiatives.
    arango.db.collection('eu_initiatives').truncate()
    browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
    try:
        for status in ['OPEN', 'CLOSED', 'UPCOMING', 'DISABLED']:
            print(status)
            # 335 is a hard safety cap; the loop normally breaks earlier when
            # a page returns no results.
            for page_number in range(0, 335):
                initiatives = await get_all_initiatives(browser, page_number, status)
                if not initiatives:
                    break  # paged past the last results page for this status
                for initiative in initiatives:
                    print(initiative['headline'])
                    initiative['_key'] = arango.fix_key(f"{initiative['headline'].split('_')[0]}")
                    initiative['meta'] = {'status': status, 'date': datetime.now().strftime('%Y-%m-%d'), 'page': page_number}
                    initiative = await get_info(browser, initiative, 'initiatives_downloads')
                    arango.db.collection('eu_initiatives').insert(initiative, overwrite=True)
                print(f'Page {page_number} done')
    finally:
        # Resource-leak fix: the original never closed the browser if any
        # scrape raised, leaving a headless Chromium process behind.
        await browser.close()
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # asyncio.run() creates, runs and cleanly closes its own event loop;
    # the original asyncio.get_event_loop().run_until_complete() pattern is
    # deprecated since Python 3.10 when no loop is running.
    # (Removed a stale commented-out snippet that counted distinct headlines
    # in the stored collection.)
    asyncio.run(main())
||||||
Loading…
Reference in new issue