better in the news
This commit is contained in:
		
							parent
							
								
									5100b5195a
								
							
						
					
					
						commit
						2c1bc5a937
					
				
					 1 changed file with 10 additions and 11 deletions
				
			
		|  | @ -3,7 +3,7 @@ | ||||||
| import urllib.request | import urllib.request | ||||||
| import re | import re | ||||||
| import json | import json | ||||||
| from html.parser import HTMLParser | from newspaper import Article | ||||||
| 
 | 
 | ||||||
| url = "https://en.wikipedia.org/wiki/Main_Page" | url = "https://en.wikipedia.org/wiki/Main_Page" | ||||||
| response = urllib.request.urlopen(url) | response = urllib.request.urlopen(url) | ||||||
|  | @ -15,21 +15,20 @@ articles_li = re.findall("<li>([\S\n\t\v ]+?)<\/li>", text_li) | ||||||
| # Iterate all articles and generate JSON feed entries. | # Iterate all articles and generate JSON feed entries. | ||||||
| wiki_base_url = "https://en.wikipedia.org" | wiki_base_url = "https://en.wikipedia.org" | ||||||
| 
 | 
 | ||||||
| class HTMLFilter(HTMLParser): |  | ||||||
|   text = "" |  | ||||||
|   def handle_data(self, data): |  | ||||||
|       self.text += data |  | ||||||
| 
 | 
 | ||||||
| json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}" | json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}" | ||||||
| items = list() | items = list() | ||||||
| 
 | 
 | ||||||
| for article in articles_li: | for article in articles_li: | ||||||
|   article_url = json.dumps(wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1)) |   article_url = wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1) | ||||||
|   f = HTMLFilter() | 
 | ||||||
|   f.feed(article) |   f = Article(article_url, keep_article_html = True) | ||||||
|   f.text |   f.download() | ||||||
|   article_title = json.dumps(f.text) |   f.parse() | ||||||
|   article_html = json.dumps("<div>{}</div>".format(article)) | 
 | ||||||
|  |   article_url = json.dumps(article_url) | ||||||
|  |   article_title = json.dumps(f.title) | ||||||
|  |   article_html = json.dumps(f.article_html) | ||||||
|   items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title, |   items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title, | ||||||
|                                                                                          html=article_html, |                                                                                          html=article_html, | ||||||
|                                                                                          url=article_url)) |                                                                                          url=article_url)) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue