unify scrape scripts
This commit is contained in:
parent 92aa7c8d97
commit 24db7c5371
4 changed files with 97 additions and 112 deletions
@@ -26,7 +26,7 @@
<url type="donation">https://github.com/sponsors/martinrotter</url>
<content_rating type="oars-1.1" />
<releases>
<release version="4.0.4" date="2021-11-29"/>
<release version="4.0.4" date="2021-12-10"/>
</releases>
<content_rating type="oars-1.0">
<content_attribute id="violence-cartoon">none</content_attribute>
@@ -1,55 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide three command line arguments:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import re
import sys
import time
import html
import urllib.request
import distutils.util
import xml.etree.ElementTree as ET

no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url=" + article.find("link").text
        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            article.find("description").text = js["data"]["content"]
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []
        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))
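Aside: both the removed script above and the new scrape-full-articles.py below talk to the same hosted article-parser endpoint and only ever read two fields of its JSON reply, so the handling reduces to roughly the sketch below. The sample payload here is invented for illustration; the real response shape is only assumed from what the scripts access.

import json

# Illustrative payload only -- assumed from the fields the scripts read
# (an "error" flag plus the extracted HTML under data.content).
sample_response = '{"error": 0, "data": {"content": "<p>Full article HTML</p>"}}'

js = json.loads(sample_response)
if int(js["error"]) == 0:  # 0 is treated as "parsed successfully"
    scraped_article = js["data"]["content"]
    print(scraped_article)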
96 resources/scripts/scrapers/scrape-full-articles.py Normal file
@@ -0,0 +1,96 @@
# Downloads full (HTML) articles for ATOM or RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw ATOM or RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-full-articles.py "4"
#
# You must provide one command line argument:
# scrape-full-articles.py [NUMBER-OF-PARALLEL-THREADS]

import json
import sys
import urllib.request
import xml.etree.ElementTree as ET

# Globals.
atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
article_parser_url = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url="


# Methods.
def process_article(article, is_rss, is_atom):
    try:
        # Extract link.
        scraped_article = ""

        if is_rss:
            article_link = article.find("link").text
        elif is_atom:
            article_link = article.find("atom:link", atom_ns).attrib['href']

        # Scrape with article-parser.
        link = article_parser_url + article_link

        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            scraped_article = js["data"]["content"]

        # Save scraped data.
        if scraped_article:
            if is_rss:
                article.find("description").text = scraped_article
            elif is_atom:
                article.find("atom:content", atom_ns).text = scraped_article
    except:
        pass


def main():
    no_threads = int(sys.argv[1]) if len(sys.argv) >= 2 else 1

    if no_threads > 1:
        import asyncio
        from concurrent.futures import ThreadPoolExecutor

    sys.stdin.reconfigure(encoding="utf-8")

    #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
    feed_data = sys.stdin.read()
    feed_document = ET.fromstring(feed_data)

    # Determine feed type.
    is_rss = feed_document.tag == "rss"
    is_atom = feed_document.tag == "{http://www.w3.org/2005/Atom}feed"

    if not is_rss and not is_atom:
        sys.exit("Passed file is neither ATOM nor RSS 2.0 feed.")

    # Extract articles.
    if is_rss:
        feed_articles = feed_document.findall(".//item")
    elif is_atom:
        feed_articles = feed_document.findall(".//atom:entry", atom_ns)

    # Scrape articles.
    if no_threads > 1:
        with ThreadPoolExecutor(max_workers=no_threads) as executor:
            futures = []
            for article in feed_articles:
                futures.append(
                    executor.submit(process_article, article, is_rss, is_atom))
            for future in futures:
                future.result()
    else:
        for article in feed_articles:
            process_article(article, is_rss, is_atom)

    print(ET.tostring(feed_document, encoding="unicode"))


if __name__ == '__main__':
    main()
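The subtle part of the new script is how ElementTree reports namespaced tags: an Atom root parses as "{http://www.w3.org/2005/Atom}feed" while an RSS 2.0 root is plain "rss", which is what the is_rss/is_atom checks and the atom_ns prefix map rely on. A standalone sketch of that detection, using two made-up minimal feeds:

import xml.etree.ElementTree as ET

atom_ns = {"atom": "http://www.w3.org/2005/Atom"}

# Tiny hand-written feeds, for illustration only.
rss_sample = """<rss version="2.0"><channel>
  <item><link>https://example.com/a</link><description>old</description></item>
</channel></rss>"""

atom_sample = """<feed xmlns="http://www.w3.org/2005/Atom">
  <entry><link href="https://example.com/b"/><content>old</content></entry>
</feed>"""

for data in (rss_sample, atom_sample):
    root = ET.fromstring(data)
    is_rss = root.tag == "rss"
    is_atom = root.tag == "{http://www.w3.org/2005/Atom}feed"
    if is_rss:
        links = [i.find("link").text for i in root.findall(".//item")]
    elif is_atom:
        links = [i.find("atom:link", atom_ns).attrib["href"]
                 for i in root.findall(".//atom:entry", atom_ns)]
    print(root.tag, links)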
@@ -1,56 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install newspaper3k
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide three command line arguments:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import re
import sys
import time
import html
import requests
import distutils.util
import xml.etree.ElementTree as ET
from newspaper import Article

no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = article.find("link").text

        f = Article(link, keep_article_html = True)
        f.download()
        f.parse()
        article.find("description").text = f.article_html
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []
        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))
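For completeness: the header comments demonstrate the intended curl pipeline, and the same stdin/stdout contract can also be driven from Python. A rough sketch, assuming the script sits at its repository path and using a placeholder feed URL:

import subprocess
import urllib.request

# Placeholder feed URL; any ATOM or RSS 2.0 feed would do.
feed_xml = urllib.request.urlopen("https://example.com/feed.xml").read()

# The script reads the feed from stdin, scrapes with 4 worker threads,
# and prints the rewritten feed XML on stdout.
result = subprocess.run(
    ["python3", "resources/scripts/scrapers/scrape-full-articles.py", "4"],
    input=feed_xml,
    capture_output=True,
    check=True,
)
print(result.stdout.decode("utf-8"))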