metacritic filtering script

parent 4de3db8000
commit 5990ff2b27

2 changed files with 67 additions and 1 deletion
@@ -26,7 +26,7 @@
   <url type="donation">https://github.com/sponsors/martinrotter</url>
   <content_rating type="oars-1.1" />
   <releases>
-    <release version="4.0.4" date="2022-01-03"/>
+    <release version="4.0.4" date="2022-01-04"/>
   </releases>
   <content_rating type="oars-1.0">
     <content_attribute id="violence-cartoon">none</content_attribute>
66  resources/scripts/scrapers/metacritic.py  Executable file
@@ -0,0 +1,66 @@
# Provides filtering of entries coming from Metacritic RSS feeds.
#
# Example input feed: https://www.metacritic.com/rss/tv
#
# This script expects raw RSS 2.0 feed data on standard input and can be called
# like this on the CLI:
# curl 'https://www.metacritic.com/rss/tv' | python ./metacritic.py "<MINIMUM-SCORE>"
#
# Replace <MINIMUM-SCORE> with the minimum numerical score an article must have
# to be kept, or pass nothing to keep all articles.

import sys
import urllib.request
import xml.etree.ElementTree as ET
import re
import ssl


def get_score_of_url(article_url):
    # Download the HTML of the article.
    req = urllib.request.Request(article_url)
    req.add_header("Accept", "*/*")
    req.add_header("User-Agent", "curl/7.55.1")

    # A bare SSLContext() performs no certificate verification.
    response = urllib.request.urlopen(req, context=ssl.SSLContext())
    text = response.read().decode("utf-8")

    # Only pages whose metascore is rendered with the "positive" CSS class match;
    # if there is no match, re.search() returns None and the .group() call raises,
    # which the caller treats as "below any threshold".
    score = int(re.search(r'metascore_w larger tvshow positive">(\d{1,2})', text).group(1))

    return score


def main():
    minimal_score = int(sys.argv[1]) if len(sys.argv) >= 2 else -1

    # Read RSS 2.0 feed data from standard input.
    sys.stdin.reconfigure(encoding="utf-8")

    # Alternative: download the feed directly instead of reading it from stdin.
    #req = urllib.request.Request("https://www.metacritic.com/rss/tv")
    #req.add_header("Accept", "*/*")
    #req.add_header("User-Agent", "curl/7.55.1")
    #feed_data = urllib.request.urlopen(req, context=ssl.SSLContext()).read()

    feed_data = sys.stdin.read()
    feed_document = ET.fromstring(feed_data)

    # Process articles one by one. findall() returns a list, so removing items
    # from the channel while iterating over it is safe.
    feed_channel = feed_document.find(".//channel")
    feed_articles = feed_channel.findall("item")

    for article in feed_articles:
        try:
            article_score = get_score_of_url(article.find("link").text)
        except Exception:
            # Scraping failed (network error or no recognizable score); treat
            # the article as falling below any threshold.
            article_score = minimal_score - 1

        if article_score < minimal_score:
            feed_channel.remove(article)
        else:
            article_title = article.find("title")
            article_title.text += " - {}".format(article_score)

    out_xml = ET.tostring(feed_document)
    out_decoded_xml = out_xml.decode()

    print(out_decoded_xml)


if __name__ == '__main__':
    main()
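For reference, the filtering step in main() can be exercised without any network access. The short standalone sketch below is illustration only; the sample feed, the fake_scores mapping, and the 60-point threshold are all invented for this example, and the scores are hard-coded instead of being scraped via get_score_of_url().

import xml.etree.ElementTree as ET

SAMPLE_FEED = """<rss version="2.0"><channel>
  <title>Sample</title>
  <item><title>Show A</title><link>https://example.com/a</link></item>
  <item><title>Show B</title><link>https://example.com/b</link></item>
</channel></rss>"""

# Hypothetical scores standing in for get_score_of_url().
fake_scores = {"https://example.com/a": 81, "https://example.com/b": 44}
minimal_score = 60

feed_document = ET.fromstring(SAMPLE_FEED)
feed_channel = feed_document.find(".//channel")

for article in feed_channel.findall("item"):
    article_score = fake_scores.get(article.find("link").text, minimal_score - 1)
    if article_score < minimal_score:
        feed_channel.remove(article)             # Show B (44) is dropped.
    else:
        article.find("title").text += " - {}".format(article_score)

print(ET.tostring(feed_document).decode())       # keeps only "Show A - 81"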