You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
38 lines
1.1 KiB
38 lines
1.1 KiB
from time import sleep

import bs4
import requests

from arango_client import arango

# Crawl the riksdagen.se web-TV search results and store every video link
# found in the ArangoDB collection "webb_tv".
#
# The outer loop builds the `riksmote` query value "20YY/YY+1" for
# YY = 00..25 (i.e. "2000/01" through "2025/26"); the inner loop pages
# through the search results for that value until a page yields no video
# links. Each stored document has the form {"_key": <id>, "url": <href>}.

# Only anchors whose href contains this prefix are treated as video links.
VIDEO_URL_PREFIX = "https://www.riksdagen.se/sv/webb-tv/"

# Seconds to wait for the server. Requests applies no timeout by default, so
# without this a stalled connection would block the crawl indefinitely.
# On expiry requests raises a Timeout exception, which aborts the script.
REQUEST_TIMEOUT = 30

for year in range(0, 26):
    # Two-digit, zero-padded start and end years, e.g. "09" and "10".
    start_yy = str(year).zfill(2)
    end_yy = str(year + 1).zfill(2)

    page = 1
    while True:
        print(f"Fetching page {page}...")
        # "%2F" is the URL-encoded "/" separating the two years.
        url = f"https://www.riksdagen.se/sv/sok/?avd=webbtv&riksmote=20{start_yy}%2F{end_yy}&p={page}"
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # An error page would otherwise be indistinguishable from an
            # empty result page; report it before leaving this session.
            print(f"Stopping at {url}: HTTP {response.status_code}")
            break

        soup = bs4.BeautifulSoup(response.content, "html.parser")

        # Collect the href of every anchor that points at a web-TV page.
        video_links = [
            anchor["href"]
            for anchor in soup.find_all("a")
            if VIDEO_URL_PREFIX in anchor.get("href", "")
        ]
        if not video_links:
            # No video links on this page: stop paging for this session.
            break

        # The document key is the text after the last underscore in the URL
        # with any slashes removed; links without an underscore are skipped.
        arango_docs = [
            {"_key": link.split("_")[-1].replace("/", ""), "url": link}
            for link in video_links
            if "_" in link
        ]
        if arango_docs:
            # overwrite=True is passed so already-stored keys are replaced
            # (per python-arango's insert_many; confirm against the
            # project's arango_client wrapper).
            arango.db.collection("webb_tv").insert_many(arango_docs, overwrite=True)

        # Pause between page requests (presumably to avoid hammering the
        # server).
        sleep(2)
        page += 1

print("Done.")