general script for scraping articles

commit 93d673ef74
parent 3b0122be63

3 changed files with 58 additions and 2 deletions

@@ -30,7 +30,7 @@
 <url type="donation">https://martinrotter.github.io/donate/</url>
 <content_rating type="oars-1.1" />
 <releases>
-  <release version="3.9.0" date="2021-04-16"/>
+  <release version="3.9.0" date="2021-04-19"/>
 </releases>
 <content_rating type="oars-1.0">
 <content_attribute id="violence-cartoon">none</content_attribute>
@@ -1 +1 @@
-Subproject commit 9c10723bfbaf6cb85107d6ee16e0324e9e487749
+Subproject commit 47f4125753452eff8800dbd6600c5a05540b15d9
resources/scripts/scrapers/scrape-rss2.py (new executable file, 56 lines)

@@ -0,0 +1,56 @@
# Downloads full articles for an RSS 2.0 feed and replaces the original article descriptions.
#
# Make sure to have all dependencies installed:
#   pip3 install newspaper3k
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
#   curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide one command line argument:
#   scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import sys
import xml.etree.ElementTree as ET

from newspaper import Article

no_threads = int(sys.argv[1])

# The thread pool is only needed for the parallel code path.
if no_threads > 1:
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = article.find("link").text

        # newspaper3k downloads the page and extracts the main article body;
        # keep_article_html preserves the extracted HTML markup.
        f = Article(link, keep_article_html=True)
        f.download()
        f.parse()

        # Replace the feed's short description with the full article HTML.
        article.find("description").text = f.article_html
    except Exception:
        # If downloading or parsing fails, leave the original item untouched.
        pass

# Scrape articles, in parallel if more than one thread was requested.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers=no_threads) as executor:
        futures = []

        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))

        # Wait for all downloads to finish before serializing the feed.
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

# Print the modified feed XML to standard output.
print(ET.tostring(rss_document, encoding="unicode"))
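
Since the script writes the rewritten feed to standard output, the result can be checked by redirecting it to a file and re-parsing it. A minimal sketch of such a check, assuming the output was saved as full-feed.xml (a hypothetical filename, not part of this commit):

import xml.etree.ElementTree as ET

# Parse the scraped feed and report how much article HTML each item now carries.
tree = ET.parse("full-feed.xml")
for item in tree.findall(".//item"):
    title = item.findtext("title", default="(no title)")
    body = item.findtext("description") or ""
    print(f"{title}: {len(body)} characters of article HTML")

Threads suit this workload well, since each task spends most of its time blocked on network I/O rather than on computation.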