enhanced script
This commit is contained in:
		
							parent
							
								
									530b46a882
								
							
						
					
					
						commit
						6f9fbc8d81
					
				
					 1 changed files with 34 additions and 8 deletions
				
			
		|  | @ -2,29 +2,47 @@ | ||||||
| # | # | ||||||
| # Make sure to have all dependencies installed: | # Make sure to have all dependencies installed: | ||||||
| #   pip3 install googletrans==4.0.0-rc1 | #   pip3 install googletrans==4.0.0-rc1 | ||||||
|  | #   (asyncio ships with the Python standard library; the parallel version of the script needs no extra install) | ||||||
| # | # | ||||||
| # You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl: | # You must provide raw RSS 2.0 UTF-8 feed XML data as input, for example with curl: | ||||||
| #   curl 'https://phys.org/rss-feed/' | python ./translate-rss2.py "en" "pt_BR" | #   curl 'https://phys.org/rss-feed/' | python ./translate-rss2.py "en" "pt_BR" "true" | ||||||
| # | # | ||||||
| # You must provide two additional command line arguments: | # You must provide three command line arguments: | ||||||
| #   translate-rss2.py [FROM-LANGUAGE] [TO-LANGUAGE] | #   translate-rss2.py [FROM-LANGUAGE] [TO-LANGUAGE] [RUN-PARALLEL] | ||||||
| 
 | 
 | ||||||
|  | import json | ||||||
|  | import re | ||||||
| import sys | import sys | ||||||
| import time | import time | ||||||
|  | import html | ||||||
|  | import requests | ||||||
|  | import distutils.util | ||||||
| import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||||
| from googletrans import Translator | from googletrans import Translator | ||||||
| 
 | 
 | ||||||
| lang_from = sys.argv[1] | lang_from = sys.argv[1] | ||||||
| lang_to = sys.argv[2] | lang_to = sys.argv[2] | ||||||
|  | parallel = bool(distutils.util.strtobool(sys.argv[3])) | ||||||
|  | 
 | ||||||
|  | if parallel: | ||||||
|  |   import asyncio | ||||||
|  |   from concurrent.futures import ThreadPoolExecutor | ||||||
|  | 
 | ||||||
| sys.stdin.reconfigure(encoding='utf-8') | sys.stdin.reconfigure(encoding='utf-8') | ||||||
| rss_data = sys.stdin.read() | rss_data = sys.stdin.read() | ||||||
| rss_document = ET.fromstring(rss_data) | rss_document = ET.fromstring(rss_data) | ||||||
| translator = Translator() | translator = Translator() | ||||||
| 
 | 
 | ||||||
| def translate_string(to_translate): | def translate_string(to_translate): | ||||||
|  |   try: | ||||||
|     translated_text = translator.translate(to_translate, src = lang_from, dest = lang_to) |     translated_text = translator.translate(to_translate, src = lang_from, dest = lang_to) | ||||||
|  | 
 | ||||||
|  |     if not parallel: | ||||||
|       time.sleep(0.2) |       time.sleep(0.2) | ||||||
|  | 
 | ||||||
|     return translated_text.text |     return translated_text.text | ||||||
|  |   except: | ||||||
|  |     return to_translate | ||||||
| 
 | 
 | ||||||
| def process_article(article): | def process_article(article): | ||||||
|   title = article.find("title") |   title = article.find("title") | ||||||
|  | @ -33,6 +51,14 @@ def process_article(article): | ||||||
|   contents = article.find("description") |   contents = article.find("description") | ||||||
|   contents.text = translate_string(" ".join(contents.itertext())) |   contents.text = translate_string(" ".join(contents.itertext())) | ||||||
| 
 | 
 | ||||||
|  | if parallel: | ||||||
|  |   with ThreadPoolExecutor(max_workers = 2) as executor: | ||||||
|  |     futures = [] | ||||||
|  |     for article in rss_document.findall(".//item"): | ||||||
|  |       futures.append(executor.submit(process_article, article)) | ||||||
|  |     for future in futures: | ||||||
|  |       future.result() | ||||||
|  | else: | ||||||
|   for article in rss_document.findall(".//item"): |   for article in rss_document.findall(".//item"): | ||||||
|     process_article(article) |     process_article(article) | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue