Merge branch 'master' of https://github.com/lasseedfast/fb-scraper
commit 6698ab6faa
20 changed files with 650 additions and 282 deletions
@@ -1,18 +1,28 @@

# Miscellaneous
/.DS_Store
/.venv
*.venv
/.vscode
/__pycache__
*.json
*.pkl
facebook/test.py
/data/*
*.html
*.code-workspace
workspace.code-workspace
password_arango.txt
*.gexf
facebook/mrkoll.
*.pyc
*.sqlite3

#facebook
/facebook
!/facebook/*.py
*.sqlite3
facebook/test.py
facebook/mrkoll.

# docker
/stats/*
!/stats/*.py

requirements2.txt
@@ -1,19 +0,0 @@

FROM python:3.8

WORKDIR /

COPY requirements.txt .

RUN pip install -r requirements.txt

ADD . .

ENTRYPOINT [ "python", "facebook/__main__.py", "-p free" ]

CMD ["",""]

# BUILD:
# docker buildx create --use
# docker buildx build --file docker/free/Dockerfile --platform linux/arm -t l3224/fb-scraper:free --push .
@@ -1,14 +1,15 @@
+# syntax=docker/dockerfile:1

-FROM python:3.8
+FROM python:3.8-slim-buster

 WORKDIR /

-COPY requirements.txt .
+COPY requirements.txt requirements.txt

-RUN pip install -r requirements.txt
+RUN pip3 install -r requirements.txt

-ADD . .
+COPY . .

-ENTRYPOINT [ "python", "facebook/mrkoll.py" ]
+ENTRYPOINT [ "python3", "mrkoll_scraperapi.py" ]

 CMD [""]

 # docker buildx build --file docker/mrkoll/Dockerfile --platform linux/arm -t l3224/fb-scraper:mrkoll --push .
@@ -0,0 +1,194 @@
import re
import subprocess
import requests
from sys import argv
from time import sleep
from bs4 import BeautifulSoup
from arango import ArangoClient


def find_person(number):
    """
    Looks up personal details based on a phone number.
    """

    sleep(2)

    url = f'https://mrkoll.se/resultat?n={number}'

    api_key = 'fcfe011cf66fddb61bb6425fcb5cb5e9'
    payload = {'api_key': api_key, 'url': url, 'country_code': 'se', 'device_type': 'desktop'}

    response = requests.get('http://api.scraperapi.com', params=payload)
    r = response.text
    # Fetch the page

    soup = BeautifulSoup(r, 'html.parser')

    if (
        "Du har gjort för många anrop" in soup.text
        or response.url == "https://mrkoll.se/om/limit/"
    ):
        sleep(10)
        return None

    # Put the data into a dictionary
    d = {}

    d["url_via_telefonnummer"] = response.url
    try:
        for a in soup.find_all("a", href=True):
            if "boende-med-" in a["href"]:
                d["lives_with_url"] = a["href"]
            if "-hushall" in a["href"]:
                d["lives_with"] = a.text
    except:
        pass

    if "Sökningen gav 0 träffar..." in soup.text:
        return {}

    info = soup.find("div", {"class": "block_col1"})

    try:
        d["first_name"] = info.find(
            "span", {"title": "Detta är personens tilltalsnamn"}
        ).text
    except:
        pass
    try:
        d["middle_name"] = info.find("span", {"title": "Detta är ett förnamn"}).text
    except:
        pass
    try:
        d["last_name"] = info.find("span", {"title": "Detta är ett efternamn"}).text
    except:
        pass
    try:
        adress = info.find_all("span", {"class": "f_line2 pl65 pl65-border"})
        d["adress_line1"] = adress[0].text
        if len(adress) > 1:
            d["adress_line2"] = adress[1].text
    except:
        pass

    try:
        d["history"] = info.find("div", {"class": "history_container"}).text
    except:
        pass

    # Personal identity number
    ## Date of birth
    for i in soup.find_all("div", {"class": "col_block1"}):
        if "Personnummer" in i.text:
            d["date_of_birth"] = i.find("span", {"class": "f_line2"}).text.replace(
                "-XXXX", ""
            )
    ## Last four digits
    try:
        start = "showPersnr"
        end = ">Jag godkänner</span>"
        t = str(soup)
        v = t[t.find(start) + 11 : t.find(end) - 2].replace("'", "").split(",")
        url_ajax = "/ajax/lastDigits/?p=" + v[0] + "&k=" + v[1]
        sleep(2)  # Wait a little
        four_last = requests.get("http://mrkoll.se" + url_ajax).text
        d["personal_number"] = "{dob}-{fl}".format(dob=d["date_of_birth"], fl=four_last)
    except:
        pass

    try:
        neighbours = {}
        for div in soup.find_all("div", {"class": "peoplecont"}):
            persons = div.find_all("a", href=True)
            for person in persons:
                neighbours[person.find("strong").text] = {
                    "link": person["href"],
                    "lived_years": re.search(
                        r"\d+", person.find("span", {"class": "flyttclass"}).text
                    ).group()[0],
                }
        d["neighbours"] = neighbours
    except:
        pass

    try:
        d["name_change"] = [
            div.text.strip() for div in soup.find_all("div", {"class": "name_change"})
        ]
    except:
        pass

    try:
        prosecuted = {}
        prosecuted["brottsmål"] = (
            True if soup.find("div", {"class": "resmark res_b"}) != None else False
        )
        prosecuted["tvistemål"] = (
            True if soup.find("div", {"class": "resmark res_t"}) != None else False
        )
        prosecuted["straffföreläggande"] = (
            True if soup.find("div", {"class": "resmark res_s"}) != None else False
        )
        d["prosecuted"] = prosecuted
    except:
        pass

    return d


if __name__ == "__main__":

    ip = 'scraperapi'

    if requests.get('https://icanhazip.com').text.strip() == '98.128.172.12':
        print('\nMULLVAD INTE AKTIV\n')
        exit()

    # Connection info for ArangoDB
    user_arango = "Phone"
    db_arango = "facebook"
    host_arango = "http://192.168.1.10:8529"

    # Open the connection to ArangoDB
    db = ArangoClient(hosts=host_arango).db(
        db_arango, username=user_arango, password=argv[1]
    )
    leak = db.collection("phoneleak")

    count = 0
    scraper_count = 0

    global errors
    errors = 0

    while True:
        count += 1

        # Fetch a random person
        doc = leak.random()

        # Do the lookup on mrkoll.se
        d = find_person(doc["phone"])

        try:
            name = d["first_name"] + ' '
        except:
            name = ' '
        print(f'{count} - {errors} {name}', end="\r")

        if d == None:  # The IP address is blocked or something went wrong
            continue

        d["_key"] = doc["_key"]
        d["_id"] = "phone/" + str(d["_key"])
        d["phone"] = doc["phone"]
        d["checked_from_ip"] = f'{ip} - cache'
        try:
            db.collection("phone").insert(d)
            leak.delete(doc["_key"])
        except:
            pass
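For reference, a minimal usage sketch for the new find_person helper above (the phone number here is hypothetical, and a working ScraperAPI key plus network access are assumed): it returns None when the lookup is rate-limited or blocked, an empty dict when the number gives no hit, and otherwise a dict with the scraped fields.

# Sketch only: look up one hypothetical number and print a few of the scraped fields.
from mrkoll_scraperapi import find_person

person = find_person('0701234567')  # hypothetical number

if person is None:
    print('Rate-limited or blocked; try again later')
elif not person:
    print('No hits for this number')
else:
    print(person.get('first_name'), person.get('last_name'))
    print(person.get('adress_line1'))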
@@ -0,0 +1,14 @@
beautifulsoup4==4.9.3
bs4==0.0.1
certifi==2021.5.30
charset-normalizer==2.0.4
idna==3.2
PyJWT==2.1.0
python-arango==7.2.0
requests==2.26.0
requests-toolbelt==0.9.1
setuptools-scm==6.0.1
soupsieve==2.2.1
toml==0.10.2
urllib3==1.26.6
requests_cache==0.7.4
@@ -0,0 +1,70 @@
import requests
import os
from datetime import date, datetime, timedelta
from time import sleep

from arangodb import db


def download_image(url, user, id):

    # Make sure the user directory exists
    if not os.path.isdir(f'../profile_pictures/{user}'):
        os.mkdir(f'../profile_pictures/{user}')

    # Download the image
    r = requests.get(url)
    if r.text == 'URL signature expired':
        print('För gammal länk.')
        exit()
    elif r.status_code == 403:
        exit()
    img_data = r.content
    with open(f'../profile_pictures/{user}/{id}.jpg', 'wb') as handler:
        handler.write(img_data)


def get_pictures(day):
    cursor = db.aql.execute(
        """
        for doc in members
        filter doc.fetched == @date
        filter has(doc, "checked_pictures")
        filter not has(doc, "pictures_downloaded")
        return {'member': doc._key, 'pictures': doc.checked_pictures}
        """,
        bind_vars={'date': day}
    )

    for doc in cursor:
        pictures = []
        for picture in doc['pictures']:
            pictures.append(picture[picture.find('fbid=') + 5:])

        cursor = db.aql.execute(
            """
            for doc in pictures
            filter doc._key in @list
            limit 10
            return {'_key': doc._key, 'user': doc.user, 'url': doc.src}
            """,
            bind_vars={"list": pictures},
        )

        for picture in cursor:
            download_image(picture['url'], picture['user'], picture['_key'])
            print(picture['_key'])
            sleep(2)

        db.update_document({'_id': 'members/' + str(doc['member']), 'pictures_downloaded': True}, silent=True, check_rev=False)


def old_pics():
    if not os.path.isdir('../profile_pictures'):
        os.mkdir('../profile_pictures')
    start = date.today()
    for i in range(1, 60):
        d = start - timedelta(days=i)
        get_pictures(d.strftime('%Y%m%d'))
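For reference, a minimal driver sketch for the picture downloader above; the module name profile_pictures is an assumption (the diff does not show the file path), and get_pictures expects the same YYYYMMDD date string that old_pics builds.

# Sketch only: download pictures for members fetched yesterday.
from datetime import date, timedelta

from profile_pictures import get_pictures  # hypothetical module name

yesterday = (date.today() - timedelta(days=1)).strftime('%Y%m%d')
get_pictures(yesterday)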
@@ -0,0 +1,15 @@
FROM python:alpine

WORKDIR /

RUN apk add --update --no-cache g++ gcc libxslt-dev

COPY requirements.txt .

RUN pip install -r requirements.txt

ADD . .

ENTRYPOINT [ "python", "stats.py" ]

# docker buildx build --file docker/stats/Dockerfile --platform linux/arm64,linux/amd64 -t mrkoll .
@@ -0,0 +1,27 @@
black==21.8b0
certifi==2020.6.20
chardet==4.0.0
click==8.0.1
httplib2==0.18.1
idna==2.10
mypy-extensions==0.4.3
packaging==21.0
pathspec==0.9.0
platformdirs==2.3.0
#pycurl==7.43.0.6
PyJWT==2.1.0
pyparsing==2.4.7
PySimpleSOAP==1.16.2
#python-apt==2.2.1
python-arango==7.2.0
python-debian==0.1.39
python-debianbts==3.1.0
regex==2021.8.28
#reportbug==7.10.3
requests==2.25.1
requests-toolbelt==0.9.1
setuptools-scm==6.3.1
six==1.16.0
tomli==1.2.1
typing-extensions==3.10.0.2
urllib3==1.26.5