Scraper for finding feeds in HTML sites
This commit is contained in:
parent
24df33dd89
commit
a96e83fb10
1 changed file with 6 additions and 3 deletions
|
@@ -16,9 +16,12 @@ regexp_href = re.compile("href=\"([^\"]+)\"")
|
||||||
|
|
||||||
for url in urls_lines:
|
for url in urls_lines:
|
||||||
# Download HTML data.
|
# Download HTML data.
|
||||||
url_response = urllib.request.urlopen(url)
|
try:
|
||||||
html = url_response.read().decode("utf-8")
|
url_response = urllib.request.urlopen(url)
|
||||||
|
html = url_response.read().decode("utf-8")
|
||||||
|
except:
|
||||||
|
continue
|
||||||
|
|
||||||
# Search for XML feeds with regexps.
|
# Search for XML feeds with regexps.
|
||||||
for link_tag in re.findall(regexp_link, html):
|
for link_tag in re.findall(regexp_link, html):
|
||||||
for link_xml_feed in re.findall(regexp_href, link_tag):
|
for link_xml_feed in re.findall(regexp_href, link_tag):
|
||||||
|
|
Loading…
Add table
Reference in a new issue