diff --git a/scripts/test_urls.py b/scripts/test_urls.py
index 96eb1a018ab896e519e904f0d661df46ffdc5e7d..c23157453c59f20bcc38085717ab752bef3a581d 100755
--- a/scripts/test_urls.py
+++ b/scripts/test_urls.py
@@ -20,6 +20,8 @@ EXCLUDE = [
     "*://phabricator.apertis.org/T*", # it's not public anyway :(
 ]
 
+urls = set()
+
 
 def get_link(url):
     link = url.group("link")
@@ -35,8 +37,7 @@ def get_link(url):
         if fnmatch.fnmatch(link, exclude):
             return
 
-    if not link in urls:
-        urls.append(link)
+    urls.add(link)
 
 
 def parse_file(filename):
@@ -56,8 +57,6 @@ def parse_file(filename):
     pattern = re.compile("href=\"(?P<link>.*?)\"")
     doc = pattern.sub(get_link, text)
 
-urls = []
-
 # Parse aliases
 for root, dirs, files in os.walk(sys.argv[1]):
     for file in files:
@@ -67,6 +66,7 @@ for root, dirs, files in os.walk(sys.argv[1]):
         except:
            print("Failed to parse %s/%s" % (root, file))
 
+urls = list(urls)
 # This seems to trigger issues with sites not wanting us to crawl them
 #urls.sort()
 # So let's randomise the list to see if that helps...
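
Note (not part of the patch): the change replaces the list-plus-membership-check pattern with a set, so duplicate links are dropped for free while files are parsed, and the set is only turned back into a list once ordering matters for crawling. A minimal standalone sketch of the same dedup-then-randomise idea, assuming random.shuffle is what the script uses for the randomisation hinted at in the trailing comments (that code is not shown in this hunk):

    import random

    urls = set()

    def add_link(link):
        # set.add() silently ignores duplicates, replacing the old
        # "if not link in urls: urls.append(link)" list pattern
        urls.add(link)

    for link in ("https://example.com/a",
                 "https://example.com/a",
                 "https://example.com/b"):
        add_link(link)

    urls = list(urls)      # sets are unordered; materialise a list first
    random.shuffle(urls)   # assumed randomisation step, to avoid hitting one site repeatedly
    print(urls)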