diff --git a/resources/scripts/scrapers/old.reddit.py b/resources/scripts/scrapers/old.reddit.py
new file mode 100644
index 000000000..2c8813f3a
--- /dev/null
+++ b/resources/scripts/scrapers/old.reddit.py
@@ -0,0 +1,69 @@
+import json
+import sys
+import time
+import urllib.parse
+import urllib.request
+
+import bs4
+
+number_of_pages = 6
+subreddit = sys.argv[1]
+url_base = "https://old.reddit.com/r/{}".format(subreddit)
+json_feed = '{{"version": "https://jsonfeed.org/version/1.1", "title": {title}, "items": [{items}]}}'
+items = []
+
+last_item_id = None
+
+for page_number in range(1, number_of_pages + 1):
+    # The first page has no cursor; later pages resume after the last post seen.
+    if page_number == 1:
+        page_url = url_base
+    else:
+        page_url = url_base + "?after={}".format(last_item_id)
+
+    # old.reddit tends to reject the default urllib user agent, so pose as curl.
+    page_request = urllib.request.Request(page_url)
+    page_request.add_header("User-Agent", "curl/8.4.0")
+    page_response = urllib.request.urlopen(page_request)
+    page_html = page_response.read().decode("utf-8")
+
+    soup = bs4.BeautifulSoup(page_html, "html.parser")
+    # Each post is a div.thing whose data-fullname starts with "t3_" (the
+    # "link" kind). An exact-match selector like [class=thing] never fires,
+    # because posts carry several classes.
+    entries = soup.select('.thing[data-fullname^="t3_"]')
+
+    # Stop early once the listing runs out of pages.
+    if not entries:
+        break
+
+    for entry in entries:
+        title_link = entry.select_one("a.title")
+        title = title_link.text
+        # Self posts use relative hrefs, so resolve against the site root.
+        url = urllib.parse.urljoin("https://old.reddit.com/", title_link["href"])
+        html = str(entry)
+        # Deleted posts have no author link.
+        author_link = entry.select_one("a.author")
+        author = author_link.text if author_link else "[deleted]"
+        publ = entry.select_one("time")["datetime"]
+        last_item_id = entry["data-fullname"]
+
+        # Assemble each JSON Feed item by hand; json.dumps escapes every field.
+        item = '{{"title": {title}, "id": {url}, "authors": [{{"name": {author}}}], "date_published": {publ}, "content_html": {html}, "url": {url}}}'.format(
+            title=json.dumps(title),
+            html=json.dumps(html),
+            url=json.dumps(url),
+            author=json.dumps(author),
+            publ=json.dumps(publ)
+        )
+        items.append(item)
+
+    # Rate-limit between page fetches.
+    time.sleep(10)
+
+json_feed = json_feed.format(
+    title=json.dumps("Old.Reddit - " + subreddit),
+    items=", ".join(items)
+)
+print(json_feed)
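
A quick way to sanity-check the scraper is to run it against a real subreddit and confirm the output survives a JSON parse, since the feed is assembled by string formatting rather than a serializer. A minimal sketch, assuming the script path from this diff, a python3 interpreter on PATH, and "programming" as a stand-in subreddit name:

import json
import subprocess

# Invoke the scraper as it would be run from the repo root; the subreddit
# argument here is only an example.
result = subprocess.run(
    ["python3", "resources/scripts/scrapers/old.reddit.py", "programming"],
    capture_output=True,
    text=True,
    check=True,
)

# json.loads raises ValueError if any hand-assembled field broke the JSON.
feed = json.loads(result.stdout)
assert feed["title"] == "Old.Reddit - programming"
assert isinstance(feed["items"], list)
print("scraped {} items".format(len(feed["items"])))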