You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
185 lines
5.5 KiB
185 lines
5.5 KiB
import re |
|
import requests |
|
from datetime import datetime |
|
from random import randint |
|
from time import sleep |
|
|
|
from pymongo import MongoClient |
|
import werkzeug |
|
werkzeug.cached_property = werkzeug.utils.cached_property |
|
from robobrowser import RoboBrowser |
|
import json |
|
|
|
|
|
class Scraper:
    """Thin wrapper around RoboBrowser, pre-navigated to mrkoll.se."""

    def __init__(self):
        # Start a browser session with a desktop Chrome user agent.
        http_session = requests.Session()
        agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/80.0.3987.132 Safari/537.36"
        )
        self.browser = RoboBrowser(
            session=http_session,
            user_agent=agent,
            history=True,
            parser="lxml",
        )
        sleep(2)
        self.browser.open("https://mrkoll.se/")

    def open(self, url):
        """Navigate the underlying browser to *url*."""
        self.browser.open(url)

    def viewing(self):
        """Return the current page as parsed HTML (BeautifulSoup)."""
        return self.browser.parsed
|
|
|
|
|
def find_person(number, scraper):
    """Search mrkoll.se for a phone *number* and scrape the result page.

    Parameters:
        number:  phone number string to search for.
        scraper: a Scraper instance whose browser is (or returns to) the
                 mrkoll.se start page.

    Returns:
        {}         when the search yields zero hits,
        "blocked"  when the site's rate-limit page is shown,
        otherwise a dict with whatever fields could be extracted
        (names, address, date of birth, neighbours, court records, ...).
    """
    d = {}
    # Return to the start page so the search form is present.
    if scraper.browser.state.url != "https://mrkoll.se/":
        scraper.browser.back()
    sleep(randint(2, 3))
    form = scraper.browser.get_form(action="requestSearch/")
    form["n"].value = number
    sleep(randint(2, 3))
    scraper.browser.submit_form(form)
    soup = scraper.viewing()

    d["url_via_telefonnummer"] = scraper.browser.state.url
    try:
        for a in soup.find_all("a", href=True):
            if "boende-med-" in a["href"]:
                d["lives_with_url"] = a["href"]
            if "-hushall" in a["href"]:
                d["lives_with"] = a.text
    except (AttributeError, KeyError, TypeError):
        pass

    if "Sökningen gav 0 träffar..." in soup.text:
        return {}
    # BUG FIX: the original tested `"Du har gjort ..." in soup`, i.e.
    # membership in a BeautifulSoup object, which never matches a
    # substring; the page text must be tested instead.
    elif (
        "Du har gjort för många anrop" in soup.text
        or scraper.browser.state.url == "https://mrkoll.se/om/limit/"
    ):
        return "blocked"

    # Main info column; None when the page layout is unexpected, in which
    # case the AttributeError handlers below simply skip each field.
    info = soup.find("div", {"class": "block_col1"})

    try:
        d["first_name"] = info.find(
            "span", {"title": "Detta är personens tilltalsnamn"}
        ).text
    except AttributeError:
        pass
    try:
        d["middle_name"] = info.find("span", {"title": "Detta är ett förnamn"}).text
    except AttributeError:
        pass
    try:
        d["last_name"] = info.find("span", {"title": "Detta är ett efternamn"}).text
    except AttributeError:
        pass
    try:
        adress = info.find_all("span", {"class": "f_line2 pl65 pl65-border"})
        d["adress_line1"] = adress[0].text
        if len(adress) > 1:
            d["adress_line2"] = adress[1].text
    except (AttributeError, IndexError):
        pass

    try:
        d["history"] = info.find("div", {"class": "history_container"}).text
    except AttributeError:
        pass

    # Personal identity number
    ## Date of birth (strip the masked last-four placeholder)
    for i in soup.find_all("div", {"class": "col_block1"}):
        if "Personnummer" in i.text:
            d["date_of_birth"] = i.find("span", {"class": "f_line2"}).text.replace(
                "-XXXX", ""
            )
    ## Last four digits, fetched through the site's AJAX endpoint whose
    ## parameters are embedded in an inline showPersnr(...) call.
    try:
        start = "showPersnr"
        end = ">Jag godkänner</span>"
        t = str(soup)
        v = t[t.find(start) + 11 : t.find(end) - 2].replace("'", "").split(",")
        url_ajax = "/ajax/lastDigits/?p=" + v[0] + "&k=" + v[1]
        sleep(2)  # brief pause before the AJAX call
        # BUG FIX: use https like every other request in this module
        # (the original mixed in a plain-http URL here).
        four_last = requests.get("https://mrkoll.se" + url_ajax).text
        d["personal_number"] = "{dob}-{fl}".format(dob=d["date_of_birth"], fl=four_last)
    except (AttributeError, IndexError, KeyError, requests.RequestException):
        # KeyError: date_of_birth was never found above.
        pass

    try:
        neighbours = {}
        for div in soup.find_all("div", {"class": "peoplecont"}):
            for person in div.find_all("a", href=True):
                years = re.search(
                    r"\d+", person.find("span", {"class": "flyttclass"}).text
                )
                # BUG FIX: the original used .group()[0], which kept only
                # the FIRST DIGIT of the year count; keep the whole match.
                neighbours[person.find("strong").text] = {
                    "link": person["href"],
                    "lived_years": years.group(),
                }
        d["neighbours"] = neighbours
    except (AttributeError, KeyError, TypeError):
        pass

    try:
        d["name_change"] = [
            div.text.strip() for div in soup.find_all("div", {"class": "name_change"})
        ]
    except AttributeError:
        pass

    try:
        # Court-record markers: presence of each div flags a record type.
        prosecuted = {
            "brottsmål": soup.find("div", {"class": "resmark res_b"}) is not None,
            "tvistemål": soup.find("div", {"class": "resmark res_t"}) is not None,
            "straffföreläggande": soup.find("div", {"class": "resmark res_s"})
            is not None,
        }
        d["prosecuted"] = prosecuted
    except AttributeError:
        pass

    return d
|
|
|
if __name__ == '__main__':
    # Connect to the local MongoDB: 'phone' holds scraped results,
    # 'leak' holds the queue of numbers still to look up.
    client = MongoClient('mongodb://localhost:27017')
    db_client = client['phone_db']
    db = db_client['phone']

    leak = db_client['leak']
    print('Nummer kvar att kolla:', leak.count_documents({}))

    scraper = Scraper()
    count = 0
    while True:
        count += 1
        print(count, end="\r")
        doc = leak.find_one()
        # BUG FIX: find_one() returns None once the queue is empty; the
        # original then crashed inside leak.delete_one(None).
        if doc is None:
            break
        leak.delete_one(doc)
        d = find_person(doc["phone"], scraper)

        # Sleep five hours during the 01:00 hour (presumably to look less
        # like a bot during low-traffic time — TODO confirm intent).
        if datetime.now().strftime("%H") == '01':
            sleep(18000)

        sleep(10)
        if d == "blocked":
            # Rate-limited by the site: stop cleanly; the current number
            # was already removed from the queue, so log it.
            client.close()
            print(doc)
            print(count, 'blocked')
            exit()

        d["_key"] = doc["_key"]
        d["_id"] = 'phone/' + str(d["_key"])
        d["phone"] = doc["phone"]
        db.insert_one(d)
|
|
|
|
|
|
|
|