unify scrape scripts

parent 92aa7c8d97
commit 24db7c5371

4 changed files with 97 additions and 112 deletions
@@ -26,7 +26,7 @@
   <url type="donation">https://github.com/sponsors/martinrotter</url>
   <content_rating type="oars-1.1" />
   <releases>
-    <release version="4.0.4" date="2021-11-29"/>
+    <release version="4.0.4" date="2021-12-10"/>
   </releases>
   <content_rating type="oars-1.0">
     <content_attribute id="violence-cartoon">none</content_attribute>
@@ -1,55 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide three command line arguments:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import re
import sys
import time
import html
import urllib.request
import distutils.util
import xml.etree.ElementTree as ET

no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url=" + article.find("link").text
        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            article.find("description").text = js["data"]["content"]
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []
        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))
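Both the deleted script above and its replacement below call the hosted article-parser endpoint and read only two fields from its reply. A minimal sketch of that parsing step; the sample payload is an assumption inferred from the fields the scripts access (js["error"], js["data"]["content"]), not the service's documented schema:

import json

# Hypothetical reply, shaped after the fields the scrapers read; the real
# service may return additional keys.
sample_reply = '{"error": 0, "data": {"content": "<p>Full article HTML</p>"}}'

js = json.loads(sample_reply)

if int(js["error"]) == 0:
    # This HTML fragment is what gets written back into the feed item.
    print(js["data"]["content"])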
96  resources/scripts/scrapers/scrape-full-articles.py  Normal file
@@ -0,0 +1,96 @@
# Downloads full (HTML) articles for ATOM or RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw ATOM or RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-full-articles.py "4"
#
# You must provide three command line arguments:
# scrape-full-articles.py [NUMBER-OF-PARALLEL-THREADS]

import json
import sys
import urllib.request
import xml.etree.ElementTree as ET

# Globals.
atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
article_parser_url = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url="


# Methods.
def process_article(article, is_rss, is_atom):
    try:
        # Extract link.
        scraped_article = ""

        if is_rss:
            article_link = article.find("link").text
        elif is_atom:
            article_link = article.find("atom:link", atom_ns).attrib['href']

        # Scrape with article-parser.
        link = article_parser_url + article_link

        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            scraped_article = js["data"]["content"]

        # Save scraped data.
        if scraped_article:
            if is_rss:
                article.find("description").text = scraped_article
            elif is_atom:
                article.find("atom:content", atom_ns).text = scraped_article
    except:
        pass


def main():
    no_threads = int(sys.argv[1]) if len(sys.argv) >= 2 else 1

    if no_threads > 1:
        import asyncio
        from concurrent.futures import ThreadPoolExecutor

    sys.stdin.reconfigure(encoding="utf-8")

    #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
    feed_data = sys.stdin.read()
    feed_document = ET.fromstring(feed_data)

    # Determine feed type.
    is_rss = feed_document.tag == "rss"
    is_atom = feed_document.tag == "{http://www.w3.org/2005/Atom}feed"

    if not is_rss and not is_atom:
        sys.exit("Passed file is neither ATOM nor RSS 2.0 feed.")

    # Extract articles.
    if is_rss:
        feed_articles = feed_document.findall(".//item")
    elif is_atom:
        feed_articles = feed_document.findall(".//atom:entry", atom_ns)

    # Scrape articles.
    if no_threads > 1:
        with ThreadPoolExecutor(max_workers=no_threads) as executor:
            futures = []
            for article in feed_articles:
                futures.append(
                    executor.submit(process_article, article, is_rss, is_atom))
            for future in futures:
                future.result()
    else:
        for article in feed_articles:
            process_article(article, is_rss, is_atom)

    print(ET.tostring(feed_document, encoding="unicode"))


if __name__ == '__main__':
    main()
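For reference, a minimal, self-contained sketch of the feed-type detection and article enumeration that the new script performs; the two inline feed snippets are hypothetical and exist only to show which branch each format takes:

import xml.etree.ElementTree as ET

# Hypothetical feed snippets, just to exercise both branches.
rss_sample = ('<rss version="2.0"><channel><item>'
              '<link>https://example.com/a</link>'
              '<description>short</description>'
              '</item></channel></rss>')
atom_sample = ('<feed xmlns="http://www.w3.org/2005/Atom"><entry>'
               '<link href="https://example.com/a"/>'
               '<content>short</content>'
               '</entry></feed>')

atom_ns = {"atom": "http://www.w3.org/2005/Atom"}

for sample in (rss_sample, atom_sample):
    doc = ET.fromstring(sample)
    is_rss = doc.tag == "rss"
    is_atom = doc.tag == "{http://www.w3.org/2005/Atom}feed"
    articles = (doc.findall(".//item") if is_rss
                else doc.findall(".//atom:entry", atom_ns))
    print(doc.tag, "->", len(articles), "article(s)")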
@@ -1,56 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install newspaper3k
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide three command line arguments:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import re
import sys
import time
import html
import requests
import distutils.util
import xml.etree.ElementTree as ET
from newspaper import Article

no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = article.find("link").text

        f = Article(link, keep_article_html = True)
        f.download()
        f.parse()
        article.find("description").text = f.article_html
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []
        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))
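The unification drops the local newspaper3k extraction path shown above in favour of the hosted article-parser endpoint used by scrape-full-articles.py. For anyone who still wants local extraction, a minimal standalone sketch of what the deleted script did per article (requires pip3 install newspaper3k; the URL is a placeholder):

from newspaper import Article

# Placeholder URL; substitute a real article link taken from the feed.
article = Article("https://example.com/some-article", keep_article_html=True)
article.download()           # fetch the page
article.parse()              # extract the main content
print(article.article_html)  # HTML fragment the old script wrote into <description>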