euscan: fix some robots.txt issues

- disable robots.txt checks for ftp URLs
- fail silently (treat the URL as allowed) when robots.txt cannot be read
- use einfo, not eerror, when a URL is blocked by robots.txt

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
Corentin Chary 2011-09-10 08:23:46 +02:00
parent c5af0e1937
commit 9da62b211b

@@ -217,7 +217,7 @@ def urlallowed(url):
     protocol, domain = urlparse.urlparse(url)[:2]
 
-    if 'protocol' == 'ftp':
+    if protocol == 'ftp':
         return True
 
     baseurl = '%s://%s' % (protocol, domain)
@@ -228,8 +228,11 @@ def urlallowed(url):
     else:
         rp = robotparser.RobotFileParser()
         rp.set_url(robotsurl)
-        rp.read()
-        rpcache[baseurl] = rp
+        try:
+            rp.read()
+            rpcache[baseurl] = rp
+        except:
+            return True
     return rp.can_fetch(CONFIG['user-agent'], url)
 
 def urlopen(url, timeout=None, verb="GET"):
@@ -253,7 +256,7 @@ def tryurl(fileurl, template):
     result = True
 
     if not urlallowed(fileurl):
-        output.eerror("Url '%s' blocked by robots.txt" % fileurl)
+        output.einfo("Url '%s' blocked by robots.txt" % fileurl)
         return None
 
     output.ebegin("Trying: " + fileurl)