import re
from datetime import datetime
from random import randint
from time import sleep

import requests
from pymongo import MongoClient

import werkzeug
# robobrowser still imports werkzeug.cached_property, which moved to
# werkzeug.utils in werkzeug 1.0; alias it back before importing robobrowser.
werkzeug.cached_property = werkzeug.utils.cached_property
from robobrowser import RoboBrowser
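
# Scraper for mrkoll.se: looks up people by phone number and stores name,
# address, date of birth, neighbours and court-record flags in MongoDB.
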
class Scraper:
    def __init__(self):
        session = requests.Session()
        # Start a browser with a desktop Chrome user agent
        user_agent = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/80.0.3987.132 Safari/537.36"
        )
        self.browser = RoboBrowser(
            session=session, user_agent=user_agent, history=True, parser="lxml"
        )
        sleep(2)
        self.browser.open("https://mrkoll.se/")

    def open(self, url):
        self.browser.open(url)

    def viewing(self):
        """Return the current page as parsed HTML."""
        return self.browser.parsed
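
# Usage sketch (the phone number is a made-up placeholder):
#   scraper = Scraper()
#   result = find_person("0701234567", scraper)
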
def find_person(number, scraper):
    d = {}
    # Go back to the start page before submitting a new search
    if scraper.browser.state.url != "https://mrkoll.se/":
        scraper.browser.back()
    sleep(randint(2, 3))
    form = scraper.browser.get_form(action="requestSearch/")
    form["n"].value = number
    sleep(randint(2, 3))
    scraper.browser.submit_form(form)
    soup = scraper.viewing()
    d["url_via_telefonnummer"] = scraper.browser.state.url
    try:
        for a in scraper.viewing().find_all("a", href=True):
            if "boende-med-" in a["href"]:
                d["lives_with_url"] = a["href"]
            if "-hushall" in a["href"]:
                d["lives_with"] = a.text
    except Exception:
        pass
    if "Sökningen gav 0 träffar..." in soup.text:
        return {}
    elif (
        "Du har gjort för många anrop" in soup.text
        or scraper.browser.state.url == "https://mrkoll.se/om/limit/"
    ):
        return "blocked"
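    # Parse the result page: every field sits in its own span/div, so each
    # lookup below is wrapped in try/except to tolerate missing elements.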
    info = soup.find("div", {"class": "block_col1"})
    try:
        d["first_name"] = info.find(
            "span", {"title": "Detta är personens tilltalsnamn"}
        ).text
    except AttributeError:
        pass
    try:
        d["middle_name"] = info.find("span", {"title": "Detta är ett förnamn"}).text
    except AttributeError:
        pass
    try:
        d["last_name"] = info.find("span", {"title": "Detta är ett efternamn"}).text
    except AttributeError:
        pass
    try:
        adress = info.find_all("span", {"class": "f_line2 pl65 pl65-border"})
        d["adress_line1"] = adress[0].text
        if len(adress) > 1:
            d["adress_line2"] = adress[1].text
    except (AttributeError, IndexError):
        pass
    try:
        d["history"] = info.find("div", {"class": "history_container"}).text
    except AttributeError:
        pass
    # Personal identity number
    ## Date of birth
    for i in soup.find_all("div", {"class": "col_block1"}):
        if "Personnummer" in i.text:
            d["date_of_birth"] = i.find("span", {"class": "f_line2"}).text.replace(
                "-XXXX", ""
            )
    ## Last four digits: the page embeds a JS call showPersnr(p, k); pull out
    ## the two arguments and call the AJAX endpoint directly.
    try:
        start = "showPersnr"
        end = ">Jag godkänner</span>"
        t = str(soup)
        # Skip past "showPersnr(" (11 chars) and strip the trailing quote/paren
        v = t[t.find(start) + 11 : t.find(end) - 2].replace("'", "").split(",")
        url_ajax = "/ajax/lastDigits/?p=" + v[0] + "&k=" + v[1]
        sleep(2)  # Wait a bit
        four_last = requests.get("https://mrkoll.se" + url_ajax).text
        d["personal_number"] = "{dob}-{fl}".format(dob=d["date_of_birth"], fl=four_last)
    except Exception:
        pass
    try:
        neighbours = {}
        for div in soup.find_all("div", {"class": "peoplecont"}):
            persons = div.find_all("a", href=True)
            for person in persons:
                neighbours[person.find("strong").text] = {
                    "link": person["href"],
                    # First run of digits in the years-lived-here text
                    "lived_years": re.search(
                        r"\d+", person.find("span", {"class": "flyttclass"}).text
                    ).group(),
                }
        d["neighbours"] = neighbours
    except Exception:
        pass
    try:
        d["name_change"] = [
            div.text.strip() for div in soup.find_all("div", {"class": "name_change"})
        ]
    except Exception:
        pass
    try:
        # Court record markers: criminal cases, civil cases, summary fines
        prosecuted = {}
        prosecuted["brottsmål"] = soup.find("div", {"class": "resmark res_b"}) is not None
        prosecuted["tvistemål"] = soup.find("div", {"class": "resmark res_t"}) is not None
        prosecuted["straffföreläggande"] = (
            soup.find("div", {"class": "resmark res_s"}) is not None
        )
        d["prosecuted"] = prosecuted
    except Exception:
        pass
    return d
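
# Note: each number is deleted from the `leak` queue before its scrape result
# is saved, so a crash mid-loop loses that number.
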
if __name__ == "__main__":
    client = MongoClient("mongodb://localhost:27017")
    db_client = client["phone_db"]
    db = db_client["phone"]
    leak = db_client["leak"]
    print("Numbers left to check:", leak.count_documents({}))
    scraper = Scraper()
    count = 0
    scraper_count = 0
    while True:
        count += 1
        print(count, end="\r")
        # Take one number off the leak queue and look it up
        doc = leak.find_one()
        leak.delete_one(doc)
        d = find_person(doc["phone"], scraper)
        # cursor = leak.aggregate([{'$sample': {'size': leak.estimated_document_count()}}], allowDiskUse=True)
        # for doc in cursor:
        #     print(doc['phone'])
        #     # Check whether the number has already been scraped
        #     q = { "phone": doc['phone'] }
        #     if len(list(db.find(q))) == 0:
        #         d = find_person(doc["phone"], scraper)
        #     continue
        # Back off for five hours during the 01:00 hour, then pause between calls
        if datetime.now().strftime("%H") == "01":
            sleep(18000)
        sleep(10)
        if d == "blocked":
            client.close()
            print(doc)
            print(count, "blocked")
            exit()
        d["_key"] = doc["_key"]
        d["_id"] = "phone/" + str(d["_key"])
        d["phone"] = doc["phone"]
        db.insert_one(d)
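
# Run directly (assumes MongoDB on localhost:27017 with phone_db.leak
# pre-populated with documents of the form {"phone": ..., "_key": ...}):
#   python <this_script>.py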