diff --git a/resources/desktop/com.github.rssguard.appdata.xml b/resources/desktop/com.github.rssguard.appdata.xml
index de4a88404..091df44fe 100644
--- a/resources/desktop/com.github.rssguard.appdata.xml
+++ b/resources/desktop/com.github.rssguard.appdata.xml
@@ -26,7 +26,7 @@
   https://github.com/sponsors/martinrotter
-
+
   none
diff --git a/resources/scripts/scrapers/scrape-as-rss2.py b/resources/scripts/scrapers/scrape-as-rss2.py
deleted file mode 100644
index 8defd7659..000000000
--- a/resources/scripts/scrapers/scrape-as-rss2.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Downloads full articles for RSS 2.0 feed and replaces original articles.
-#
-# Make sure to have all dependencies installed:
-# pip3 install asyncio (if using parallel version of the script)
-#
-# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
-# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
-#
-# You must provide three command line arguments:
-# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]
-
-import json
-import re
-import sys
-import time
-import html
-import urllib.request
-import distutils.util
-import xml.etree.ElementTree as ET
-
-no_threads = int(sys.argv[1])
-
-if no_threads > 1:
-    import asyncio
-    from concurrent.futures import ThreadPoolExecutor
-
-sys.stdin.reconfigure(encoding='utf-8')
-rss_data = sys.stdin.read()
-rss_document = ET.fromstring(rss_data)
-
-def process_article(article):
-    try:
-        link = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url=" + article.find("link").text
-        response = urllib.request.urlopen(link)
-        text = response.read().decode("utf-8")
-        js = json.loads(text)
-
-        if int(js["error"]) == 0:
-            article.find("description").text = js["data"]["content"]
-    except:
-        pass
-
-# Scrape articles.
-if no_threads > 1:
-    with ThreadPoolExecutor(max_workers = no_threads) as executor:
-        futures = []
-        for article in rss_document.findall(".//item"):
-            futures.append(executor.submit(process_article, article))
-        for future in futures:
-            future.result()
-else:
-    for article in rss_document.findall(".//item"):
-        process_article(article)
-
-print(ET.tostring(rss_document, encoding = "unicode"))
\ No newline at end of file
diff --git a/resources/scripts/scrapers/scrape-full-articles.py b/resources/scripts/scrapers/scrape-full-articles.py
new file mode 100644
index 000000000..a40b69f24
--- /dev/null
+++ b/resources/scripts/scrapers/scrape-full-articles.py
@@ -0,0 +1,96 @@
+# Downloads full (HTML) articles for ATOM or RSS 2.0 feed and replaces original articles.
+#
+# Make sure to have all dependencies installed:
+# pip3 install asyncio (if using parallel version of the script)
+#
+# You must provide raw ATOM or RSS 2.0 UTF-8 feed XML data as input, for example with curl:
+# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-full-articles.py "4"
+#
+# You can provide one optional command line argument:
+# scrape-full-articles.py [NUMBER-OF-PARALLEL-THREADS]
+
+import json
+import sys
+import urllib.request
+import xml.etree.ElementTree as ET
+
+# Globals.
+atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
+article_parser_url = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url="
+
+
+# Methods.
+def process_article(article, is_rss, is_atom):
+    try:
+        # Extract link.
+        scraped_article = ""
+
+        if is_rss:
+            article_link = article.find("link").text
+        elif is_atom:
+            article_link = article.find("atom:link", atom_ns).attrib['href']
+
+        # Scrape with article-parser.
+        link = article_parser_url + article_link
+
+        response = urllib.request.urlopen(link)
+        text = response.read().decode("utf-8")
+        js = json.loads(text)
+
+        if int(js["error"]) == 0:
+            scraped_article = js["data"]["content"]
+
+        # Save scraped data.
+        if scraped_article:
+            if is_rss:
+                article.find("description").text = scraped_article
+            elif is_atom:
+                article.find("atom:content", atom_ns).text = scraped_article
+    except:
+        pass
+
+
+def main():
+    no_threads = int(sys.argv[1]) if len(sys.argv) >= 2 else 1
+
+    if no_threads > 1:
+        import asyncio
+        from concurrent.futures import ThreadPoolExecutor
+
+    sys.stdin.reconfigure(encoding="utf-8")
+
+    #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
+    feed_data = sys.stdin.read()
+    feed_document = ET.fromstring(feed_data)
+
+    # Determine feed type.
+    is_rss = feed_document.tag == "rss"
+    is_atom = feed_document.tag == "{http://www.w3.org/2005/Atom}feed"
+
+    if not is_rss and not is_atom:
+        sys.exit("Passed file is neither ATOM nor RSS 2.0 feed.")
+
+    # Extract articles.
+    if is_rss:
+        feed_articles = feed_document.findall(".//item")
+    elif is_atom:
+        feed_articles = feed_document.findall(".//atom:entry", atom_ns)
+
+    # Scrape articles.
+    if no_threads > 1:
+        with ThreadPoolExecutor(max_workers=no_threads) as executor:
+            futures = []
+            for article in feed_articles:
+                futures.append(
+                    executor.submit(process_article, article, is_rss, is_atom))
+            for future in futures:
+                future.result()
+    else:
+        for article in feed_articles:
+            process_article(article, is_rss, is_atom)
+
+    print(ET.tostring(feed_document, encoding="unicode"))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/resources/scripts/scrapers/scrape-rss2.py b/resources/scripts/scrapers/scrape-rss2.py
deleted file mode 100644
index 63ab40eb7..000000000
--- a/resources/scripts/scrapers/scrape-rss2.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Downloads full articles for RSS 2.0 feed and replaces original articles.
-#
-# Make sure to have all dependencies installed:
-# pip3 install newspaper3k
-# pip3 install asyncio (if using parallel version of the script)
-#
-# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
-# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
-#
-# You must provide three command line arguments:
-# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]
-
-import json
-import re
-import sys
-import time
-import html
-import requests
-import distutils.util
-import xml.etree.ElementTree as ET
-from newspaper import Article
-
-no_threads = int(sys.argv[1])
-
-if no_threads > 1:
-    import asyncio
-    from concurrent.futures import ThreadPoolExecutor
-
-sys.stdin.reconfigure(encoding='utf-8')
-rss_data = sys.stdin.read()
-rss_document = ET.fromstring(rss_data)
-
-def process_article(article):
-    try:
-        link = article.find("link").text
-
-        f = Article(link, keep_article_html = True)
-        f.download()
-        f.parse()
-        article.find("description").text = f.article_html
-    except:
-        pass
-
-# Scrape articles.
-if no_threads > 1:
-    with ThreadPoolExecutor(max_workers = no_threads) as executor:
-        futures = []
-        for article in rss_document.findall(".//item"):
-            futures.append(executor.submit(process_article, article))
-        for future in futures:
-            future.result()
-else:
-    for article in rss_document.findall(".//item"):
-        process_article(article)
-
-print(ET.tostring(rss_document, encoding = "unicode"))
\ No newline at end of file
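Note (illustrative only, not part of the commit above): a minimal sketch of how the new scraper can be exercised locally, assuming Python 3.7+ and that scrape-full-articles.py sits in the working directory. It pipes a tiny hand-written RSS 2.0 document through the script with a single worker thread and prints whatever feed XML the script writes to stdout; the item link below is a placeholder, so the article body will likely be left unchanged.

    import subprocess

    # A throwaway RSS 2.0 feed with one item; the link is a placeholder.
    minimal_rss = """<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0">
      <channel>
        <title>Example</title>
        <item>
          <title>Hello</title>
          <link>https://example.com/article</link>
          <description>short teaser</description>
        </item>
      </channel>
    </rss>"""

    # The script reads the feed from stdin; the only argument is the thread count.
    result = subprocess.run(
        ["python3", "scrape-full-articles.py", "1"],
        input=minimal_rss,
        capture_output=True,
        text=True,
    )

    # The (possibly rewritten) feed XML is printed on stdout.
    print(result.stdout)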