euscan: robots.txt, timeout, user-agent, ...

- Add a blacklist for robots.txt; we *want* to scan sourceforge (see the sketch after this list)
- Set a user-agent that doesn't look like a browser
- Handle timeouts more carefully
- If brute force detects too many versions, avoid infinite loops
- Handle redirections more carefully
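
The first four points translate roughly to the following minimal sketch. All names and values here (ROBOTS_BLACKLIST, BRUTEFORCE_MAX, allowed_by_robots, the timeout and cap values) are illustrative assumptions, not euscan's actual identifiers:

# Illustrative sketch only -- names, blacklist contents and limits
# below are assumptions, not euscan's real code.
import urllib2
import urlparse
import robotparser

USER_AGENT = "euscan/git"                # self-identifying, clearly not a browser
TIMEOUT = 5                              # seconds; assumed value
ROBOTS_BLACKLIST = ["sourceforge.net"]   # hosts whose robots.txt we ignore
BRUTEFORCE_MAX = 30                      # assumed cap on brute-forced versions

def robots_blacklisted(url):
    # Ignore robots.txt for blacklisted hosts (we *want* to scan sourceforge).
    host = urlparse.urlparse(url).netloc
    return any(host.endswith(domain) for domain in ROBOTS_BLACKLIST)

def allowed_by_robots(url):
    # Honor robots.txt everywhere except on blacklisted hosts.
    if robots_blacklisted(url):
        return True
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, "/robots.txt"))
    try:
        rp.read()
    except IOError:
        return True  # unreachable robots.txt: don't block the scan
    return rp.can_fetch(USER_AGENT, url)

def fetch(url):
    # Explicit user-agent plus a timeout so a dead mirror can't hang the scan.
    request = urllib2.Request(url, headers={"User-Agent": USER_AGENT})
    return urllib2.urlopen(request, timeout=TIMEOUT)

def bruteforce_guard(versions):
    # Bail out instead of looping forever when a site "matches" every version.
    return len(versions) <= BRUTEFORCE_MAX

A user-agent that identifies the scanner instead of masquerading as a browser also lets upstream admins recognize, whitelist or throttle it.
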

Signed-off-by: Corentin Chary <corentincj@iksaif.net>
Author: Corentin Chary
Date: 2011-09-21 10:09:50 +02:00
parent 8c40a1795c
commit 14971584af
6 changed files with 95 additions and 17 deletions


@@ -12,6 +12,7 @@ __email__ = "corentin.chary@gmail.com"
 __version__ = "git"
 __productname__ = "euscan"
 __description__ = "A tool to detect new upstream releases."
""" Imports """
@@ -19,6 +20,7 @@ import os
 import sys
 import getopt
 import errno
+import httplib
 from portage.output import white, yellow, turquoise, green, EOutput
@@ -134,13 +136,13 @@ def parseArgs():
            pp.output.nocolor()
        elif o in ("-q", "--quiet"):
            CONFIG['quiet'] = True
-           CONFIG['verbose'] = False
+           CONFIG['verbose'] = 0
        elif o in ("-1", "--oneshot"):
            CONFIG['oneshot'] = True
        elif o in ("-b", "--brute-force"):
            CONFIG['brute-force'] = int(a)
        elif o in ("-v", "--verbose") and not CONFIG['quiet']:
-           CONFIG['verbose'] = True
+           CONFIG['verbose'] += 1
        else:
            return_code = False
@@ -197,6 +199,9 @@ def main():
    output = EOutput(CONFIG['quiet'])
+   if CONFIG['verbose'] > 2:
+       httplib.HTTPConnection.debuglevel = 1
+
    ret = scan_upstream(package)
    print ()
    for url, version in ret:
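
Since -v is now cumulative, passing it three times (verbose > 2) turns on httplib's wire-level tracing, which is exactly what the new debuglevel line does. A standalone sketch of the effect, assuming Python 2 and a placeholder host (example.com):

# httplib echoes the request/response dialogue to stdout once
# debuglevel is non-zero; running euscan -vvv flips this switch.
import httplib

httplib.HTTPConnection.debuglevel = 1
conn = httplib.HTTPConnection("example.com")
conn.request("HEAD", "/", headers={"User-Agent": "euscan/git"})
print(conn.getresponse().status)  # debug lines appear around this call

Setting the attribute on the HTTPConnection class rather than on one instance makes every connection verbose, so all of euscan's HTTP traffic is traced at once.
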