import re
import json

import requests
from datetime import datetime
from random import randint
from time import sleep

from pymongo import MongoClient

# RoboBrowser imports cached_property from werkzeug's top level, which newer
# Werkzeug versions moved to werkzeug.utils; patch it back before importing.
import werkzeug
werkzeug.cached_property = werkzeug.utils.cached_property
from robobrowser import RoboBrowser


class Scraper:
    def __init__(self):
        session = requests.Session()

        # Start the browser
        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/80.0.3987.132 Safari/537.36"
        )
        self.browser = RoboBrowser(
            session=session, user_agent=user_agent, history=True, parser="lxml"
        )
        sleep(2)
        self.browser.open("https://mrkoll.se/")

    def open(self, url):
        self.browser.open(url)

    def viewing(self):
        """Return the current page as parsed HTML."""
        return self.browser.parsed


def find_person(number, scraper):
    d = {}

    # Go back to the start page before submitting a new search
    if scraper.browser.state.url != "https://mrkoll.se/":
        scraper.browser.back()

    sleep(randint(2, 3))

    # Fill in and submit the search form with the phone number
    form = scraper.browser.get_form(action="requestSearch/")
    form["n"].value = number
    sleep(randint(2, 3))
    scraper.browser.submit_form(form)

    soup = scraper.viewing()
    d["url_via_telefonnummer"] = scraper.browser.state.url

    # Link and label for the household ("lives with") page
    try:
        for a in scraper.viewing().find_all("a", href=True):
            if "boende-med-" in a["href"]:
                d["lives_with_url"] = a["href"]
            if "-hushall" in a["href"]:
                d["lives_with"] = a.text
    except Exception:
        pass

    if "Sökningen gav 0 träffar..." in soup.text:
        return {}
    elif (
        "Du har gjort för många anrop" in soup.text
        or scraper.browser.state.url == "https://mrkoll.se/om/limit/"
    ):
        return "blocked"

    info = soup.find("div", {"class": "block_col1"})

    # Name fields
    try:
        d["first_name"] = info.find(
            "span", {"title": "Detta är personens tilltalsnamn"}
        ).text
    except Exception:
        pass
    try:
        d["middle_name"] = info.find("span", {"title": "Detta är ett förnamn"}).text
    except Exception:
        pass
    try:
        d["last_name"] = info.find("span", {"title": "Detta är ett efternamn"}).text
    except Exception:
        pass

    # Address lines
    try:
        adress = info.find_all("span", {"class": "f_line2 pl65 pl65-border"})
        d["adress_line1"] = adress[0].text
        if len(adress) > 1:
            d["adress_line2"] = adress[1].text
    except Exception:
        pass

    # Address history
    try:
        d["history"] = info.find("div", {"class": "history_container"}).text
    except Exception:
        pass

    # Personal identity number
    ## Date of birth
    for i in soup.find_all("div", {"class": "col_block1"}):
        if "Personnummer" in i.text:
            d["date_of_birth"] = i.find("span", {"class": "f_line2"}).text.replace(
                "-XXXX", ""
            )

    ## Last four digits, fetched via the site's AJAX endpoint
    try:
        start = "showPersnr"
        end = ">Jag godkänner"
        t = str(soup)
        v = t[t.find(start) + 11 : t.find(end) - 2].replace("'", "").split(",")
        url_ajax = "/ajax/lastDigits/?p=" + v[0] + "&k=" + v[1]
        sleep(2)  # Wait a little
        four_last = requests.get("http://mrkoll.se" + url_ajax).text
        d["personal_number"] = "{dob}-{fl}".format(dob=d["date_of_birth"], fl=four_last)
    except Exception:
        pass

    # Neighbours and how many years they have lived at the address
    try:
        neighbours = {}
        for div in soup.find_all("div", {"class": "peoplecont"}):
            persons = div.find_all("a", href=True)
            for person in persons:
                neighbours[person.find("strong").text] = {
                    "link": person["href"],
                    "lived_years": re.search(
                        r"\d+", person.find("span", {"class": "flyttclass"}).text
                    ).group(),
                }
        d["neighbours"] = neighbours
    except Exception:
        pass

    # Name changes
    try:
        d["name_change"] = [
            div.text.strip() for div in soup.find_all("div", {"class": "name_change"})
        ]
    except Exception:
        pass

    # Court record markers: criminal cases, civil cases, summary fines
    try:
        prosecuted = {}
        prosecuted["brottsmål"] = soup.find("div", {"class": "resmark res_b"}) is not None
        prosecuted["tvistemål"] = soup.find("div", {"class": "resmark res_t"}) is not None
        prosecuted["straffföreläggande"] = (
            soup.find("div", {"class": "resmark res_s"}) is not None
        )
        d["prosecuted"] = prosecuted
    except Exception:
        pass

    return d


if __name__ == "__main__":
    client = MongoClient("mongodb://localhost:27017")
    db_client = client["phone_db"]
    db = db_client["phone"]    # results collection
    leak = db_client["leak"]   # queue of phone numbers still to look up

    print("Numbers left to check:", leak.count_documents({}))

    scraper = Scraper()
    count = 0
    scraper_count = 0

    while True:
        count += 1
        print(count, end="\r")

        # Take one number from the queue and look it up
        doc = leak.find_one()
        leak.delete_one(doc)
        d = find_person(doc["phone"], scraper)

        # cursor = leak.aggregate(
        #     [{'$sample': {'size': leak.estimated_document_count()}}],
        #     allowDiskUse=True,
        # )
        # for doc in cursor:
        #     print(doc['phone'])
        #     # Check whether the number has already been looked up
        #     q = {"phone": doc['phone']}
        #     if len(list(db.find(q))) == 0:
        #         d = find_person(doc["phone"], scraper)
        #     continue

        # Pause for five hours at 01:00, and throttle every request
        if datetime.now().strftime("%H") == "01":
            sleep(18000)
        sleep(10)

        # Stop cleanly if the site has rate limited us
        if d == "blocked":
            client.close()
            print(doc)
            print(count, "blocked")
            exit()

        d["_key"] = doc["_key"]
        d["_id"] = "phone/" + str(d["_key"])
        d["phone"] = doc["phone"]
        db.insert_one(d)