parent
e3d6933702
commit
4147b7185d
1 changed files with 163 additions and 0 deletions
@ -0,0 +1,163 @@ |
|||||||
|
import re |
||||||
|
import json |
||||||
|
import asyncio |
||||||
|
import os |
||||||
|
from pyppeteer import launch |
||||||
|
from bs4 import BeautifulSoup |
||||||
|
from _arango import ArangoDB |
||||||
|
from datetime import datetime |
||||||
|
import random |
||||||
|
from colorprinter.print_color import * |
||||||
|
|
||||||
|
# Wall-clock timestamp taken at import time, presumably to measure total run
# time — NOTE(review): `start` is never read anywhere in this file; confirm
# whether it is still needed or was left over from timing experiments.
start = datetime.now()
||||||
|
|
||||||
|
def sanitize_filename(filename):
    """Return *filename* with characters that are illegal in common
    filesystems (\\ / * ? : " < > |) each replaced by an underscore."""
    illegal = '\\/*?:"<>|'
    return filename.translate(str.maketrans({ch: "_" for ch in illegal}))
||||||
|
|
||||||
|
async def get_info(browser, doc, download_path):
    """Open the initiative page for *doc*, download its attached PDF, and
    scrape the consultation link, summary and feedback URL into *doc*.

    Parameters
    ----------
    browser : pyppeteer Browser used to open a fresh tab.
    doc : dict with at least 'link' (absolute URL), '_key' and 'meta' keys;
        mutated in place.
    download_path : directory where Chromium drops the downloaded file.

    Returns
    -------
    dict
        The same *doc*, enriched with 'consultation_link', 'summary',
        optionally 'feedback_url' and 'meta.file_path_doc'.
    """
    page = await browser.newPage()
    # Tell Chromium where to put downloads. pyppeteer exposes no public API
    # for this, hence the private CDP call via page._client.
    await page._client.send('Page.setDownloadBehavior', {
        'behavior': 'allow',
        'downloadPath': download_path
    })
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')

    await page.goto(doc['link'], {'waitUntil': 'networkidle2'})
    await asyncio.sleep(2)  # give late client-side rendering a chance to finish
    content = await page.content()

    # Locate and click the download button.
    try:
        await page.click('a.ecl-file__download')  # adjust the selector as needed
        await asyncio.sleep(5)  # wait for the download to complete
    except Exception as e:
        print(f"Error clicking download button: {e}")

    await page.close()

    # Rename the freshly downloaded file after the document key. Files whose
    # names already contain '_' are assumed to have been renamed on an
    # earlier pass and are skipped.
    for filename in os.listdir(download_path):
        if filename.endswith(".pdf") and '_' not in filename:  # adjust the file extension as needed
            old_file = os.path.join(download_path, filename)
            new_file = os.path.join(download_path, f"{doc['_key']}.pdf")
            try:
                # Replace any stale copy from a previous run.
                if os.path.exists(new_file):
                    os.remove(new_file)
                os.rename(old_file, new_file)
            # Was a bare `except:` (swallowed KeyboardInterrupt/SystemExit);
            # the filesystem calls above only raise OSError.
            except OSError:
                try:
                    # Fallback for over-long target paths: truncate. NOTE:
                    # this loses the '.pdf' suffix — kept for compatibility.
                    os.rename(old_file, new_file[:50])
                except OSError:
                    os.remove(old_file)
            doc['meta']['file_path_doc'] = new_file
            break

    soup = BeautifulSoup(content, 'html.parser')

    consultation_button = soup.find('span', {'class': 'ecl-ecl-button__container'})
    if consultation_button:
        # BUGFIX: doc['link'] is already an absolute URL (built with the
        # https://ec.europa.eu prefix in get_all_initiatives), so prefixing
        # the domain again produced a broken double-prefixed URL.
        doc['consultation_link'] = f"{doc['link']}/public-consultation_en"
    else:
        doc['consultation_link'] = None
    summary_div = soup.find('div', {'class': 'initiative-detail-summary'})
    if summary_div:
        doc['summary'] = summary_div.get_text(strip=True)
    else:
        doc['summary'] = ''

    # The feedback page is linked only by its anchor text.
    for link in soup.find_all('a'):
        if 'All feedback and statistics' in link.get_text():
            doc['feedback_url'] = f"https://ec.europa.eu{link['href']}"
            break

    return doc
||||||
|
|
||||||
|
async def get_all_initiatives(browser, page_number: int, status):
    """Scrape one results page of the EU 'Have your say' initiatives listing.

    Parameters
    ----------
    browser : pyppeteer Browser used to open a fresh tab.
    page_number : 0-based page index of the paginated listing.
    status : feedback-status filter, e.g. 'OPEN' or 'CLOSED'.

    Returns
    -------
    list[dict] | None
        One dict per initiative (headline, link, topic, type_of_act,
        feedback_period), or None when the page has no results — i.e. the
        caller has paged past the end.
    """
    page = await browser.newPage()
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36')

    url = f'https://ec.europa.eu/info/law/better-regulation/have-your-say/initiatives_en?feedbackStatus={status}&page={page_number}'

    await page.goto(url, {'waitUntil': 'networkidle2'})
    await asyncio.sleep(2)  # let client-side rendering settle
    content = await page.content()
    await page.close()

    soup = BeautifulSoup(content, 'html.parser')

    # Guard against an unexpected page layout: the original indexed [0]
    # unconditionally, which raised IndexError on a missing content block.
    blocks = soup.find_all('div', {'class': 'ux-block-content'})
    if not blocks:
        return None
    html = blocks[0]

    articles = html.find_all('article', {'class': 'search-result-item'})
    if not articles:
        return None

    initiatives = []
    for article in articles:
        try:
            link_tag = article.find('a', {'class': 'ecl-u-pt-xs ecl-u-type-none'})
            link = f"https://ec.europa.eu{link_tag['href']}"
            headline = link_tag.find('div', {'class': 'search-result-title'}).get_text(strip=True)

            # Each metadata field sits in a label <div> followed by a value <div>.
            topic_div = article.find('div', string='Topic')
            topic = topic_div.find_next_sibling('div').get_text(strip=True) if topic_div else None

            type_of_act_div = article.find('div', string='Type of act')
            type_of_act = type_of_act_div.find_next_sibling('div').get_text(strip=True) if type_of_act_div else None

            feedback_period_div = article.find('div', string='Feedback period')
            feedback_period = feedback_period_div.find_next_sibling('div').get_text(strip=True) if feedback_period_div else None

            initiatives.append({
                'headline': headline,
                'link': link,
                'topic': topic,
                'type_of_act': type_of_act,
                'feedback_period': feedback_period,
            })
        except Exception as e:
            # A parse failure here means the site markup changed: dump the
            # offending article for debugging and abort the whole run.
            print(f"Error fetching initiative: {e}")
            print(article.prettify())
            print(html.prettify())
            # Was exit() — which depends on the `site` module and exits with
            # status 0 even on this error path; exit non-zero instead.
            raise SystemExit(1)
    return initiatives
||||||
|
|
||||||
|
async def main():
    """Rebuild the 'eu_initiatives' ArangoDB collection from scratch by
    scraping every listing page for each feedback status, enriching each
    initiative via get_info, and upserting it.

    Side effects: truncates the existing collection, downloads PDFs into
    'initiatives_downloads', and launches/closes a headless Chromium.
    """
    arango = ArangoDB()
    if not arango.db.has_collection('eu_initiatives'):
        arango.db.create_collection('eu_initiatives')
    # Full rebuild: drop all previously stored initiatives.
    arango.db.collection('eu_initiatives').truncate()
    browser = await launch(headless=True, args=['--no-sandbox', '--disable-setuid-sandbox'])
    try:
        for status in ['OPEN', 'CLOSED', 'UPCOMING', 'DISABLED']:
            print(status)
            # 335 is a hard safety cap; the loop normally breaks earlier when
            # a page returns no results.
            for page_number in range(0, 335):
                initiatives = await get_all_initiatives(browser, page_number, status)
                if not initiatives:
                    break  # paged past the last results page for this status
                for initiative in initiatives:
                    print(initiative['headline'])
                    initiative['_key'] = arango.fix_key(f"{initiative['headline'].split('_')[0]}")
                    initiative['meta'] = {'status': status, 'date': datetime.now().strftime('%Y-%m-%d'), 'page': page_number}
                    initiative = await get_info(browser, initiative, 'initiatives_downloads')
                    arango.db.collection('eu_initiatives').insert(initiative, overwrite=True)
                print(f'Page {page_number} done')
    finally:
        # Resource-leak fix: the original never closed the browser if any
        # scrape raised, leaving a headless Chromium process behind.
        await browser.close()
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # asyncio.run() creates, runs and cleanly closes its own event loop;
    # the original asyncio.get_event_loop().run_until_complete() pattern is
    # deprecated since Python 3.10 when no loop is running.
    # (Removed a stale commented-out snippet that counted distinct headlines
    # in the stored collection.)
    asyncio.run(main())
||||||
Loading…
Reference in new issue