unify scrape scripts

Martin Rotter 2021-12-10 12:07:26 +01:00
parent 92aa7c8d97
commit 24db7c5371
4 changed files with 97 additions and 112 deletions

@@ -26,7 +26,7 @@
<url type="donation">https://github.com/sponsors/martinrotter</url>
<content_rating type="oars-1.1" />
<releases>
- <release version="4.0.4" date="2021-11-29"/>
+ <release version="4.0.4" date="2021-12-10"/>
</releases>
<content_rating type="oars-1.0">
<content_attribute id="violence-cartoon">none</content_attribute>

@@ -1,55 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide one command line argument:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]
import json
import re
import sys
import time
import html
import urllib.request
import distutils.util
import xml.etree.ElementTree as ET
no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')

rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url=" + article.find("link").text
        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            article.find("description").text = js["data"]["content"]
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []

        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))

        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))

@@ -0,0 +1,96 @@
# Downloads full (HTML) articles for ATOM or RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw ATOM or RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-full-articles.py "4"
#
# You must provide one command line argument:
# scrape-full-articles.py [NUMBER-OF-PARALLEL-THREADS]
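#
# A sketch of the same invocation for an ATOM feed; the feed URL below is a hypothetical
# placeholder, not taken from this repository:
#   curl 'https://example.com/atom.xml' | python ./scrape-full-articles.py "4"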
import json
import sys
import urllib.request
import xml.etree.ElementTree as ET
# Globals.
atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
article_parser_url = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url="
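# Note: the article-parser endpoint is assumed to answer with JSON shaped like
# {"error": 0, "data": {"content": "<full article HTML>"}}; this is inferred from how
# process_article() below consumes the response, not from the service's documentation.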
# Methods.
def process_article(article, is_rss, is_atom):
    try:
        # Extract link.
        scraped_article = ""

        if is_rss:
            article_link = article.find("link").text
        elif is_atom:
            article_link = article.find("atom:link", atom_ns).attrib['href']

        # Scrape with article-parser.
        link = article_parser_url + article_link
        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            scraped_article = js["data"]["content"]

        # Save scraped data.
        if scraped_article:
            if is_rss:
                article.find("description").text = scraped_article
            elif is_atom:
                article.find("atom:content", atom_ns).text = scraped_article
    except:
        pass

def main():
    no_threads = int(sys.argv[1]) if len(sys.argv) >= 2 else 1

    if no_threads > 1:
        import asyncio
        from concurrent.futures import ThreadPoolExecutor

    sys.stdin.reconfigure(encoding="utf-8")

    #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
    feed_data = sys.stdin.read()
    feed_document = ET.fromstring(feed_data)

    # Determine feed type.
    is_rss = feed_document.tag == "rss"
    is_atom = feed_document.tag == "{http://www.w3.org/2005/Atom}feed"

    if not is_rss and not is_atom:
        sys.exit("Passed file is neither ATOM nor RSS 2.0 feed.")

    # Extract articles.
    if is_rss:
        feed_articles = feed_document.findall(".//item")
    elif is_atom:
        feed_articles = feed_document.findall(".//atom:entry", atom_ns)

    # Scrape articles.
    if no_threads > 1:
        with ThreadPoolExecutor(max_workers=no_threads) as executor:
            futures = []

            for article in feed_articles:
                futures.append(
                    executor.submit(process_article, article, is_rss, is_atom))

            for future in futures:
                future.result()
    else:
        for article in feed_articles:
            process_article(article, is_rss, is_atom)

    print(ET.tostring(feed_document, encoding="unicode"))

if __name__ == '__main__':
    main()

@@ -1,56 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install newspaper3k
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide one command line argument:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]
import json
import re
import sys
import time
import html
import requests
import distutils.util
import xml.etree.ElementTree as ET
from newspaper import Article
no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')

rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = article.find("link").text
        f = Article(link, keep_article_html = True)
        f.download()
        f.parse()

        article.find("description").text = f.article_html
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []

        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))

        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))