# Obtains Product Hunt articles and prints them as a JSON feed on stdout.
# Input: the HTML of a topic page such as "https://www.producthunt.com/topics/XXXX",
# whose contents must be provided on stdin.

import json
import re
import sys
import urllib.request
from datetime import datetime, timezone
from html.parser import HTMLParser
from urllib.parse import urljoin

from bs4 import BeautifulSoup


BASE_URL = "https://www.producthunt.com"

# Used when the input document has no <title> element.
DEFAULT_FEED_TITLE = "Product Hunt"

# The class names carry a variable suffix after the double underscore
# (presumably a build hash -- confirm against the live markup), so the
# elements are matched by their stable prefix only.
POST_CLASS = re.compile(r"^styles_postContent__.+$")
CONTENT_CLASS = re.compile(r"^styles_content__.+$")


def make_item(href, title, content_html, published):
    """Return one feed item as a plain dict.

    Args:
        href: Link target taken from the article's anchor; may be relative
            to ``BASE_URL`` or already absolute.
        title: Article heading text.
        content_html: Article summary text.
        published: Timestamp string stored as ``date_published``.

    Returns:
        A dict with the keys ``title``, ``content_html``, ``url`` and
        ``date_published``, in that order.
    """
    return {
        "title": title,
        "content_html": content_html,
        # urljoin leaves absolute links alone and inserts the missing "/" for
        # relative ones, which plain string concatenation got wrong.
        "url": urljoin(BASE_URL, href),
        "date_published": published,
    }


def render_feed(title, items):
    """Serialize the feed title and its items to a JSON string.

    Serializing the whole structure in one ``json.dumps`` call guarantees
    valid JSON and yields the same layout the hand-written templates did.
    """
    return json.dumps({"title": title, "items": list(items)})


def extract_items(soup, published):
    """Collect feed items from a parsed Product Hunt topic page.

    Args:
        soup: Parsed document (a ``BeautifulSoup`` object).
        published: Timestamp string applied to every item, because the
            script does not read a publication date from the page.

    Returns:
        A list of item dicts in document order. Entries lacking a heading
        or a link are skipped instead of aborting the whole feed.
    """
    items = []
    for post in soup.find_all("div", {"class": POST_CLASS}):
        for content in post.find_all("div", {"class": CONTENT_CLASS}):
            heading = content.h3
            summary = content.p
            link = summary.a if summary is not None else None
            href = link.get("href") if link is not None else None
            if heading is None or not href:
                continue
            items.append(make_item(href, heading.text, summary.text, published))
    return items


def main():
    """Read the page HTML from stdin and print the JSON feed to stdout."""
    soup = BeautifulSoup(sys.stdin.read(), "html.parser")
    # Timezone-aware UTC so the timestamp carries an explicit "+00:00" offset;
    # datetime.utcnow() returns a naive value and is deprecated.
    published = datetime.now(timezone.utc).isoformat()
    title = soup.title.text if soup.title is not None else DEFAULT_FEED_TITLE
    print(render_feed(title, extract_items(soup, published)))


if __name__ == "__main__":
    main()