diff --git a/resources/scripts/scrapers/wiki-inthenews.py b/resources/scripts/scrapers/wiki-inthenews.py
index d95847f54..d7988c1ae 100755
--- a/resources/scripts/scrapers/wiki-inthenews.py
+++ b/resources/scripts/scrapers/wiki-inthenews.py
@@ -3,7 +3,7 @@
import urllib.request
import re
import json
-from html.parser import HTMLParser
+from newspaper import Article
url = "https://en.wikipedia.org/wiki/Main_Page"
response = urllib.request.urlopen(url)
@@ -15,21 +15,20 @@ articles_li = re.findall("<li>([\S\n\t\v ]+?)<\/li>", text_li)
# Iterate all articles and generate JSON feed entries.
wiki_base_url = "https://en.wikipedia.org"
-class HTMLFilter(HTMLParser):
- text = ""
- def handle_data(self, data):
- self.text += data
json_feed = "{{\"title\": \"Wikipedia - In the news\", \"items\": [{items}]}}"
items = list()
for article in articles_li:
- article_url = json.dumps(wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1))
- f = HTMLFilter()
- f.feed(article)
- f.text
- article_title = json.dumps(f.text)
-    article_html = json.dumps("{}<br/>".format(article))
+ article_url = wiki_base_url + re.search("^.+?href=\"(.+?)\"", article).group(1)
+
+ f = Article(article_url, keep_article_html = True)
+ f.download()
+ f.parse()
+
+ article_url = json.dumps(article_url)
+ article_title = json.dumps(f.title)
+ article_html = json.dumps(f.article_html)
items.append("{{\"title\": {title}, \"content_html\": {html}, \"url\": {url}}}".format(title=article_title,
html=article_html,
url=article_url))