diff --git a/resources/scripts/scrapers/hudebnibazar.py b/resources/scripts/scrapers/hudebnibazar.py
new file mode 100644
index 000000000..dbbdc096c
--- /dev/null
+++ b/resources/scripts/scrapers/hudebnibazar.py
@@ -0,0 +1,115 @@
+# Obtains listings from hudebnibazar.cz and outputs them as a JSON Feed.
+#
+# How to call: python3 hudebnibazar.py <category> <number_of_pages>
+# For example: python3 hudebnibazar.py "elektricke-kytary/110100" 4
+
+import urllib.request
+import json
+import sys
+import ssl
+import http.cookies
+import dateparser
+import bs4
+import datetime
+
+# ssl._DEFAULT_CIPHERS = "TLS_RSA_WITH_AES_256_GCM_SHA384"
+category = sys.argv[1]
+number_of_pages = int(sys.argv[2])
+
+url_base = "https://hudebnibazar.cz"
+url = "{}/{}/?is=1&f=&n=vse&r=&i=50&o=datum&ign=on".format(url_base, category)
+json_feed = '{{"title": "HudebniBazar - {cat}", "items": [{items}]}}'
+items = list()
+
+# Force TLSv1.2 with a relaxed security level; the site's legacy TLS setup
+# otherwise fails the handshake with modern OpenSSL defaults.
+ct = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
+ct.set_ciphers("DEFAULT:@SECLEVEL=0")
+
+
+def processListingDate(string_date: str):
+    # Relative Czech dates such as "před 5 min.", "dnes v 14:30" and
+    # "včera v 9:15" are left to dateparser, which handles them directly.
+    return dateparser.parse(string_date, languages=["cs"])
+
+
+def processListingImgs(listing: bs4.Tag):
+    pics = list()
+    for pic in listing.find("div", class_="InzeratObr").find_all("a"):
+        pics.append(url_base + pic.get("href"))
+
+    return pics
+
+
+def generateListingJson(listing):
+    article_price = listing.find(class_="InzeratCena").contents[0].get_text(strip=True)
+    article_title = listing.find(class_="InzeratNadpis").b.get_text(strip=True)
+    article_date = listing.find(class_="InzeratZarazeno").get_text(strip=True)
+
+    article_parsed_date = processListingDate(article_date)
+    article_imgs = processListingImgs(listing)
+
+    # dateparser returns None for formats it cannot parse; fall back to the
+    # current time so isoformat() below cannot fail.
+    if article_parsed_date is None:
+        article_parsed_date = datetime.datetime.now()
+
+    if len(article_imgs) == 0:
+        article_attachments = ""
+    else:
+        article_attachments = ", ".join(
+            ['{{"url": {}, "mime_type": "image/jpeg"}}'.format(json.dumps(i)) for i in article_imgs]
+        )
+
+    article_url = json.dumps(url_base + listing.a["href"])
+    article_fulltitle = json.dumps("[{}] {}".format(article_price, article_title))
+    article_html = json.dumps(listing.find(class_="InzeratText").get_text(strip=True))
+    # The seller's name lives in the only attribute-less <div> inside the listing.
+    article_author = json.dumps(
+        listing.find(lambda tag: tag.name == "div" and not tag.attrs).get_text(
+            strip=True
+        )
+    )
+    article_fulldate = json.dumps(article_parsed_date.isoformat())
+
+    return '{{"title": {title}, "attachments": [{att}], "authors": [{{"name": {author}}}], "date_published": {publ}, "content_html": {html}, "url": {url}}}'.format(
+        title=article_fulltitle,
+        html=article_html,
+        url=article_url,
+        author=article_author,
+        publ=article_fulldate,
+        att=article_attachments,
+    )
+
+
+php_ssid = None
+
+for page_number in range(1, number_of_pages + 1):
+    page_url = url + "&p={}".format(page_number)
+    page_request = urllib.request.Request(page_url)
+    page_request.add_header("User-Agent", "curl/8.4.0")
+
+    if php_ssid is not None:
+        page_request.add_header("Cookie", "PHPSESSID={};".format(php_ssid))
+
+    page_response = urllib.request.urlopen(page_request, context=ct)
+    page_html = page_response.read().decode("utf-8")
+
+    # Remember the session cookie from the first response and replay it on
+    # subsequent page requests.
+    if php_ssid is None:
+        set_cookie = page_response.getheader("Set-Cookie")
+        if set_cookie is not None:
+            cook = http.cookies.SimpleCookie()
+            cook.load(set_cookie)
+            if "PHPSESSID" in cook:
+                php_ssid = cook["PHPSESSID"].value
+
+    soup = bs4.BeautifulSoup(page_html, "html.parser")
+    listings = soup.find_all("div", class_="InzeratBody")
+
+    for listing in listings:
+        items.append(generateListingJson(listing))
+
+json_feed = json_feed.format(cat=category, items=", ".join(items))
+print(json_feed)