# Downloads the full article behind each item of an RSS 2.0 feed and
# replaces the original article descriptions with the full text.
#
# The script uses only the Python 3 standard library, so no extra
# dependencies need to be installed (the parallel version relies on
# concurrent.futures, which ships with Python 3).
#
# You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl:
#   curl 'http://rss.cnn.com/rss/edition.rss' | python ./scrape-rss2.py "4"
#
# You must provide one command line argument:
#   scrape-rss2.py NUMBER-OF-PARALLEL-THREADS

import json
import sys
import urllib.request
import xml.etree.ElementTree as ET

# Number of parallel worker threads, taken from the command line.
no_threads = int(sys.argv[1])

if no_threads > 1:
  from concurrent.futures import ThreadPoolExecutor
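
# Design note: the parallel version uses a thread pool rather than asyncio
# because urllib.request is blocking; each worker thread simply blocks on
# its own HTTP request.
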
# Read the raw UTF-8 feed XML from standard input and parse it.
sys.stdin.reconfigure(encoding='utf-8')
rss_data = sys.stdin.read()
rss_document = ET.fromstring(rss_data)

def process_article(article):
  # Fetch the full article through the article-parser service and, on
  # success, replace the item's description with the full content.
  try:
    link = "https://us-central1-technews-251304.cloudfunctions.net/article-parser?url=" + article.find("link").text
    response = urllib.request.urlopen(link)
    text = response.read().decode("utf-8")
    js = json.loads(text)

    if int(js["error"]) == 0:
      article.find("description").text = js["data"]["content"]
  except Exception:
    # Leave the original description in place if scraping fails.
    pass

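# The article-parser endpoint appears to return JSON of roughly this shape
# (inferred from the fields accessed above; this is not an official schema):
#   {"error": 0, "data": {"content": "<full article text>"}}
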
# Scrape articles, in parallel when more than one worker thread was requested.
if no_threads > 1:
  with ThreadPoolExecutor(max_workers = no_threads) as executor:
    futures = []
    for article in rss_document.findall(".//item"):
      futures.append(executor.submit(process_article, article))
    # Wait for every download to finish before printing the feed.
    for future in futures:
      future.result()
else:
  for article in rss_document.findall(".//item"):
    process_article(article)

# Emit the rewritten feed as XML on standard output.
print(ET.tostring(rss_document, encoding = "unicode"))
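
# Example (assumed invocation, mirroring the curl example at the top;
# the output file name is illustrative):
#   curl 'http://rss.cnn.com/rss/edition.rss' | python3 ./scrape-rss2.py 4 > feed-full.xml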