euscan: fix some robots.txt issues
- disable checks for ftp
- fail silently
- use einfo and not eerror

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
parent c5af0e1937
commit 9da62b211b
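As context for the first hunk below: the old condition compared the string literal 'protocol' against 'ftp', which can never be equal, so the FTP short-circuit never fired and robots.txt checks were still attempted for ftp:// URLs. A quick interpreter check (illustrative only, not part of the commit):

>>> 'protocol' == 'ftp'   # old test: two unequal literals, always False
False
>>> protocol = 'ftp'      # scheme as parsed from an ftp:// URL
>>> protocol == 'ftp'     # fixed test compares the parsed scheme
True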
@@ -217,7 +217,7 @@ def urlallowed(url):
 
     protocol, domain = urlparse.urlparse(url)[:2]
 
-    if 'protocol' == 'ftp':
+    if protocol == 'ftp':
         return True
 
     baseurl = '%s://%s' % (protocol, domain)
@@ -228,8 +228,11 @@ def urlallowed(url):
     else:
         rp = robotparser.RobotFileParser()
         rp.set_url(robotsurl)
-        rp.read()
-        rpcache[baseurl] = rp
+        try:
+            rp.read()
+            rpcache[baseurl] = rp
+        except:
+            return True
     return rp.can_fetch(CONFIG['user-agent'], url)
 
 def urlopen(url, timeout=None, verb="GET"):
@@ -253,7 +256,7 @@ def tryurl(fileurl, template):
     result = True
 
     if not urlallowed(fileurl):
-        output.eerror("Url '%s' blocked by robots.txt" % fileurl)
+        output.einfo("Url '%s' blocked by robots.txt" % fileurl)
         return None
 
     output.ebegin("Trying: " + fileurl)
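For reference, a minimal self-contained sketch of the patched urlallowed() logic. It assumes Python 3's urllib.parse and urllib.robotparser in place of the Python 2 urlparse and robotparser modules the diff uses, and CONFIG, rpcache, and the cache-hit branch are assumptions based on the surrounding euscan code, not shown in the diff:

import urllib.parse
import urllib.robotparser

CONFIG = {'user-agent': 'euscan'}   # assumed euscan config dict
rpcache = {}                        # per-host cache of parsed robots.txt files

def urlallowed(url):
    protocol, domain = urllib.parse.urlparse(url)[:2]

    # robots.txt does not apply to FTP, so skip the check entirely
    if protocol == 'ftp':
        return True

    baseurl = '%s://%s' % (protocol, domain)
    robotsurl = urllib.parse.urljoin(baseurl, 'robots.txt')

    if baseurl in rpcache:                      # assumed cache-hit branch
        rp = rpcache[baseurl]
    else:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robotsurl)
        try:
            rp.read()
            rpcache[baseurl] = rp
        except Exception:
            # fail silently: if robots.txt cannot be fetched, allow the URL
            return True

    return rp.can_fetch(CONFIG['user-agent'], url)

With this sketch, urlallowed('ftp://ftp.gnu.org/gnu/') returns True immediately, and a host whose robots.txt is unreachable no longer raises out of urlallowed().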