unify scrape scripts

parent 92aa7c8d97
commit 24db7c5371

4 changed files with 97 additions and 112 deletions
@@ -26,7 +26,7 @@
   <url type="donation">https://github.com/sponsors/martinrotter</url>
   <content_rating type="oars-1.1" />
   <releases>
-    <release version="4.0.4" date="2021-11-29"/>
+    <release version="4.0.4" date="2021-12-10"/>
   </releases>
   <content_rating type="oars-1.0">
     <content_attribute id="violence-cartoon">none</content_attribute>
@@ -1,55 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide three command line arguments:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import re
import sys
import time
import html
import urllib.request
import distutils.util
import xml.etree.ElementTree as ET

no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url=" + article.find("link").text
        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            article.find("description").text = js["data"]["content"]
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []
        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))
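Both the deleted script above and its replacement below call the hosted article-parser endpoint and read only two fields from its reply. A minimal sketch of that parsing step; the sample payload is an assumption inferred from the fields the scripts access (js["error"], js["data"]["content"]), not the service's documented schema:

import json

# Hypothetical reply, shaped after the fields the scrapers read; the real
# service may return additional keys.
sample_reply = '{"error": 0, "data": {"content": "<p>Full article HTML</p>"}}'

js = json.loads(sample_reply)

if int(js["error"]) == 0:
    # This HTML fragment is what gets written back into the feed item.
    print(js["data"]["content"])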
96  resources/scripts/scrapers/scrape-full-articles.py  Normal file
@@ -0,0 +1,96 @@
# Downloads full (HTML) articles for ATOM or RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw ATOM or RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-full-articles.py "4"
#
# You must provide three command line arguments:
# scrape-full-articles.py [NUMBER-OF-PARALLEL-THREADS]

import json
import sys
import urllib.request
import xml.etree.ElementTree as ET

# Globals.
atom_ns = {"atom": "http://www.w3.org/2005/Atom"}
article_parser_url = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url="


# Methods.
def process_article(article, is_rss, is_atom):
    try:
        # Extract link.
        scraped_article = ""

        if is_rss:
            article_link = article.find("link").text
        elif is_atom:
            article_link = article.find("atom:link", atom_ns).attrib['href']

        # Scrape with article-parser.
        link = article_parser_url + article_link

        response = urllib.request.urlopen(link)
        text = response.read().decode("utf-8")
        js = json.loads(text)

        if int(js["error"]) == 0:
            scraped_article = js["data"]["content"]

        # Save scraped data.
        if scraped_article:
            if is_rss:
                article.find("description").text = scraped_article
            elif is_atom:
                article.find("atom:content", atom_ns).text = scraped_article
    except:
        pass


def main():
    no_threads = int(sys.argv[1]) if len(sys.argv) >= 2 else 1

    if no_threads > 1:
        import asyncio
        from concurrent.futures import ThreadPoolExecutor

    sys.stdin.reconfigure(encoding="utf-8")

    #feed_data = urllib.request.urlopen("https://dilbert.com/feed").read()
    feed_data = sys.stdin.read()
    feed_document = ET.fromstring(feed_data)

    # Determine feed type.
    is_rss = feed_document.tag == "rss"
    is_atom = feed_document.tag == "{http://www.w3.org/2005/Atom}feed"

    if not is_rss and not is_atom:
        sys.exit("Passed file is neither ATOM nor RSS 2.0 feed.")

    # Extract articles.
    if is_rss:
        feed_articles = feed_document.findall(".//item")
    elif is_atom:
        feed_articles = feed_document.findall(".//atom:entry", atom_ns)

    # Scrape articles.
    if no_threads > 1:
        with ThreadPoolExecutor(max_workers=no_threads) as executor:
            futures = []
            for article in feed_articles:
                futures.append(
                    executor.submit(process_article, article, is_rss, is_atom))
            for future in futures:
                future.result()
    else:
        for article in feed_articles:
            process_article(article, is_rss, is_atom)

    print(ET.tostring(feed_document, encoding="unicode"))


if __name__ == '__main__':
    main()
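For reference, a minimal, self-contained sketch of the feed-type detection and article enumeration that the new script performs; the two inline feed snippets are hypothetical and exist only to show which branch each format takes:

import xml.etree.ElementTree as ET

# Hypothetical feed snippets, just to exercise both branches.
rss_sample = ('<rss version="2.0"><channel><item>'
              '<link>https://example.com/a</link>'
              '<description>short</description>'
              '</item></channel></rss>')
atom_sample = ('<feed xmlns="http://www.w3.org/2005/Atom"><entry>'
               '<link href="https://example.com/a"/>'
               '<content>short</content>'
               '</entry></feed>')

atom_ns = {"atom": "http://www.w3.org/2005/Atom"}

for sample in (rss_sample, atom_sample):
    doc = ET.fromstring(sample)
    is_rss = doc.tag == "rss"
    is_atom = doc.tag == "{http://www.w3.org/2005/Atom}feed"
    articles = (doc.findall(".//item") if is_rss
                else doc.findall(".//atom:entry", atom_ns))
    print(doc.tag, "->", len(articles), "article(s)")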
@@ -1,56 +0,0 @@
# Downloads full articles for RSS 2.0 feed and replaces original articles.
#
# Make sure to have all dependencies installed:
# pip3 install newspaper3k
# pip3 install asyncio (if using parallel version of the script)
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
# curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide three command line arguments:
# scrape-rss2.py [NUMBER-OF-PARALLEL-THREADS]

import json
import re
import sys
import time
import html
import requests
import distutils.util
import xml.etree.ElementTree as ET
from newspaper import Article

no_threads = int(sys.argv[1])

if no_threads > 1:
    import asyncio
    from concurrent.futures import ThreadPoolExecutor

sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
    try:
        link = article.find("link").text

        f = Article(link, keep_article_html = True)
        f.download()
        f.parse()
        article.find("description").text = f.article_html
    except:
        pass

# Scrape articles.
if no_threads > 1:
    with ThreadPoolExecutor(max_workers = no_threads) as executor:
        futures = []
        for article in rss_document.findall(".//item"):
            futures.append(executor.submit(process_article, article))
        for future in futures:
            future.result()
else:
    for article in rss_document.findall(".//item"):
        process_article(article)

print(ET.tostring(rss_document, encoding = "unicode"))
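The unification drops the local newspaper3k extraction path shown above in favour of the hosted article-parser endpoint used by scrape-full-articles.py. For anyone who still wants local extraction, a minimal standalone sketch of what the deleted script did per article (requires pip3 install newspaper3k; the URL is a placeholder):

from newspaper import Article

# Placeholder URL; substitute a real article link taken from the feed.
article = Article("https://example.com/some-article", keep_article_html=True)
article.download()           # fetch the page
article.parse()              # extract the main content
print(article.article_html)  # HTML fragment the old script wrote into <description>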