"""Look up phone numbers on mrkoll.se through rotating proxies.

When run as a script, random documents are taken from the ``phoneleak``
ArangoDB collection, each phone number is searched on mrkoll.se, and the
scraped result is inserted into the ``phone`` collection.
"""

import json
import re
import socket
import ssl
import urllib
import urllib.error
import urllib.request as request
from getpass import getpass
from time import sleep

import requests
import urllib3
from arango import ArangoClient
from bs4 import BeautifulSoup
from requests.auth import HTTPProxyAuth

from servers_oxylabs import servers

# Module-level network configuration.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
socket.setdefaulttimeout(20)
# NOTE: this disables TLS certificate verification for urllib HTTPS requests.
ssl._create_default_https_context = ssl._create_unverified_context


def find_person(number, errors, server):
    """Search mrkoll.se for a phone number and scrape the resulting page.

    Args:
        number: Phone number placed in the ``n`` query parameter.
        errors: Running error counter; incremented on a failed or
            rate-limited request and handed back to the caller.
        server: Mapping with an ``"ip"`` key giving the proxy host to use.

    Returns:
        A ``(data, errors)`` tuple. ``data`` is ``None`` when the request
        failed or the site reported too many calls, an empty dict when the
        search gave zero hits, and otherwise a dict with the scraped fields.
    """
    # NOTE(review): proxy credentials are hard-coded in source; consider
    # loading them from the environment or a config file instead.
    password = 'T8ARbTg6qY'
    user = 'edfast'
    ip = server['ip']
    proxy = f'http://{user}:{password}@{ip}:6000'
    url = f'https://mrkoll.se/resultat?n={number}'
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
    headers = {'User-agent': user_agent}
    query = request.build_opener(request.ProxyHandler({'https': proxy}))
    req = request.Request(url, headers=headers)

    try:
        sleep(2)
        # The context manager closes the connection once the body is read.
        with query.open(req) as response:
            final_url = response.geturl()
            r = response.read().decode()
    except (urllib.error.URLError, socket.timeout) as e:
        # URLError also covers HTTPError and proxy/connection failures.
        print(e)
        sleep(2)
        errors += 1
        return None, errors

    soup = BeautifulSoup(r, 'html.parser')
    page_text = soup.text

    # The site signals rate limiting either in the page text or by
    # redirecting to its limit page.
    if (
        "Du har gjort för många anrop" in page_text
        or final_url == "https://mrkoll.se/om/limit/"
    ):
        errors += 1
        return None, errors

    if "Sökningen gav 0 träffar..." in page_text:
        return {}, errors

    # Collect the scraped data in a dictionary.
    d = {"url_via_telefonnummer": final_url}

    for a in soup.find_all("a", href=True):
        if "boende-med-" in a["href"]:
            d["lives_with_url"] = a["href"]
        if "-hushall" in a["href"]:
            d["lives_with"] = a.text

    info = soup.find("div", {"class": "block_col1"})
    if info is not None:
        # Each name part is identified by the tooltip title of its span.
        name_titles = {
            "first_name": "Detta är personens tilltalsnamn",
            "middle_name": "Detta är ett förnamn",
            "last_name": "Detta är ett efternamn",
        }
        for key, title in name_titles.items():
            span = info.find("span", {"title": title})
            if span is not None:
                d[key] = span.text

        adress = info.find_all("span", {"class": "f_line2 pl65 pl65-border"})
        if adress:
            d["adress_line1"] = adress[0].text
            if len(adress) > 1:
                d["adress_line2"] = adress[1].text

        history = info.find("div", {"class": "history_container"})
        if history is not None:
            d["history"] = history.text

    # Personal identity number: date of birth part.
    for block in soup.find_all("div", {"class": "col_block1"}):
        if "Personnummer" in block.text:
            span = block.find("span", {"class": "f_line2"})
            if span is not None:
                d["date_of_birth"] = span.text.replace("-XXXX", "")

    # Personal identity number: last four digits, fetched from an ajax
    # endpoint whose parameters are embedded in the page's showPersnr call.
    t = str(soup)
    start_idx = t.find("showPersnr")
    end_idx = t.find(">Jag godkänner")
    # Only attempt the lookup when both markers exist and there is a date of
    # birth to combine with; otherwise the slice below would be garbage.
    if start_idx != -1 and end_idx != -1 and "date_of_birth" in d:
        try:
            v = t[start_idx + 11 : end_idx - 2].replace("'", "").split(",")
            url_ajax = "/ajax/lastDigits/?p=" + v[0] + "&k=" + v[1]
            sleep(2)  # Wait a little between requests.
            reply = requests.get("http://mrkoll.se" + url_ajax, timeout=20)
            reply.raise_for_status()
            d["personal_number"] = "{dob}-{fl}".format(
                dob=d["date_of_birth"], fl=reply.text
            )
        except (IndexError, requests.RequestException):
            # Best effort: the record is still useful without this field.
            pass

    try:
        neighbours = {}
        for div in soup.find_all("div", {"class": "peoplecont"}):
            for person in div.find_all("a", href=True):
                neighbours[person.find("strong").text] = {
                    "link": person["href"],
                    # Keep the whole number, not just its first digit.
                    "lived_years": re.search(
                        r"\d+", person.find("span", {"class": "flyttclass"}).text
                    ).group(),
                }
        d["neighbours"] = neighbours
    except (AttributeError, TypeError):
        # A missing element or number leaves "neighbours" unset, as before.
        pass

    d["name_change"] = [
        div.text.strip() for div in soup.find_all("div", {"class": "name_change"})
    ]

    # Flags derived from the presence of the corresponding result markers.
    d["prosecuted"] = {
        "brottsmål": soup.find("div", {"class": "resmark res_b"}) is not None,
        "tvistemål": soup.find("div", {"class": "resmark res_t"}) is not None,
        "straffföreläggande": soup.find("div", {"class": "resmark res_s"})
        is not None,
    }

    return d, errors


def main():
    """Loop forever over the proxy servers, scraping one random number each."""
    servers_json = servers

    # ArangoDB connection settings.
    user_arango = "Phone"
    db_arango = "facebook"
    host_arango = "http://192.168.1.10:8529"

    # Prompt for the ArangoDB password and open the connection.
    pwd = getpass('Arangolösenord för Phone:').strip()
    db = ArangoClient(hosts=host_arango).db(
        db_arango, username=user_arango, password=pwd
    )
    leak = db.collection("phoneleak")

    count = 0
    errors = 0

    while True:
        for server in servers_json:
            count += 1

            # Fetch a random person.
            doc = leak.random()

            # Run the search on mrkoll.se.
            d, errors = find_person(doc["phone"], errors, server)
            print(f'{count} - {errors}', end="\r")
            sleep(2)

            if d is None:
                # The IP address is blocked or something else went wrong.
                continue

            d["_key"] = doc["_key"]
            d["_id"] = "phone/" + str(d["_key"])
            d["phone"] = doc["phone"]
            d["checked_from_ip"] = 'oxylabs'

            try:
                db.collection("phone").insert(d)
                leak.delete(doc["_key"])
            except Exception as e:
                # Keep the scraper running, but do not hide the failure.
                print(e)


if __name__ == "__main__":
    main()