39 lines
		
	
	
		
			No EOL
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
			
		
		
	
	
			39 lines
		
	
	
		
			No EOL
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable file
		
	
	
	
	
# Obtains today's articles from the "In the news" section of Wikipedia's Main Page.
 | |
| 
 | |
import html
import json
import re
import urllib.parse
import urllib.request
from html.parser import HTMLParser
 | |
| 
 | |
# Download the Main Page. The context manager closes the HTTP response even
# when reading or decoding fails.
url = "https://en.wikipedia.org/wiki/Main_Page"
with urllib.request.urlopen(url) as response:
    text = response.read().decode("utf-8")

# Grab the first <ul> that follows the "In the news" marker and split it into
# its <li> items. The patterns are raw strings so that the regex engine, not
# the Python string parser, interprets escapes such as \S.
news_list = re.search(r"In the news[\S\n\t\v ]+?<ul>([\S\n\t\v ]+?)</ul>", text)
if news_list is None:
    # Fail with an explicit message instead of an AttributeError on None.
    raise RuntimeError('Could not find the "In the news" list in ' + url)
text_li = news_list.group(1)
articles_li = re.findall(r"<li>([\S\n\t\v ]+?)</li>", text_li)

# Iterate all articles and generate JSON feed entries.
# Base that the hrefs found in the items are resolved against.
wiki_base_url = "https://en.wikipedia.org"
 | |
| 
 | |
class HTMLFilter(HTMLParser):
    """HTML parser that discards the markup and keeps only the text.

    Feed it HTML with ``feed()``; the character data seen so far is then
    available, concatenated in document order, in the ``text`` attribute.
    """

    # Class-level default, so ``text`` is readable on the class itself and on
    # instances that have not received any data yet.
    text = ""

    def reset(self):
        """Reset the parser state and drop the text collected so far.

        Without this override, text from a previous document would remain
        in ``text`` when the same instance is reset and fed again.
        """
        super().reset()
        self.text = ""

    def handle_data(self, data):
        """Append one run of character data reported by the parser."""
        self.text += data
 | |
| 
 | |
# Build the feed as plain Python data and serialize it once at the end, so
# that json.dumps handles all quoting and escaping. With its default
# separators (", " and ": ") and the key order used below, the output has the
# layout {"title": ..., "items": [{"title": ..., "content_html": ..., "url": ...}, ...]}.
href_pattern = re.compile(r'^.+?href="(.+?)"')
items = []

for article in articles_li:
    # Strip the markup to obtain a plain-text title for the entry. close()
    # makes the parser flush any text it is still buffering.
    text_filter = HTMLFilter()
    text_filter.feed(article)
    text_filter.close()

    item = {
        "title": text_filter.text,
        "content_html": "<div>{}</div>".format(article),
    }

    # The first link on the first line of the item becomes the entry URL.
    # Attribute values are HTML-escaped, so decode entities first; urljoin
    # then resolves site-relative paths against the wiki base and leaves
    # absolute or protocol-relative links pointing at their own host.
    link = href_pattern.search(article)
    if link is not None:
        item["url"] = urllib.parse.urljoin(wiki_base_url, html.unescape(link.group(1)))
    # An item without any link is still emitted, just without a "url" key.

    items.append(item)

json_feed = json.dumps({"title": "Wikipedia - In the news", "items": items})

print(json_feed)