diff --git a/scripts/test_urls.py b/scripts/test_urls.py
index ca5f17ac4a1dd832e9475b42b9c23ba0235a1574..41a335f163b33145b6f9a462cf4d035229b1dafc 100755
--- a/scripts/test_urls.py
+++ b/scripts/test_urls.py
@@ -16,6 +16,9 @@ import concurrent.futures
 import time
 import fnmatch
 import textwrap
+import itertools
+
+PARALLEL = 50  # worker threads for concurrent URL checking
 
 EXCLUDE = [
     "*://lavaphabbridge.apertis.org", #
@@ -23,6 +26,13 @@ EXCLUDE = [
     "*://phabricator.apertis.org/T*", # it's not public anyway :(
 ]
 
+RATELIMITED = [  # hosts that throttle rapid requests; their URLs are checked serially
+    "live.gnome.org",
+    "wiki.gnome.org",
+]
+
+RATELIMIT_DELAY_SECONDS = 2  # pause between consecutive requests to a rate-limited host
+
 urls = set()
 
 def get_link(url):
@@ -105,6 +115,7 @@ def url_check(url):
     # Some servers aren't setup to handle HEAD requests, so check anything
     # that's not got a 200 status code with GET as well.
     if not status:
+        time.sleep(RATELIMIT_DELAY_SECONDS)  # pause before retrying: the HEAD failure may itself be throttling
         resp = None
         resp = session.get(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
         status = resp.ok
@@ -118,10 +129,33 @@ def url_check(url):
     print(url, "OK" if status else "FAIL", resp.status_code if resp else "-", f"{end - start:0.4f}s")
     return url, status
 
+def urls_check(urls):
+    # Check a group of URLs serially, pausing between requests so rate-limited
+    # hosts aren't hammered. No sleep after the final URL: singleton groups
+    # (the common, non-rate-limited case) incur no delay at all.
+    results = []
+    for i, url in enumerate(urls):
+        if i:
+            time.sleep(RATELIMIT_DELAY_SECONDS)
+        results.append(url_check(url))
+    return results
+
 print(f"Testing {len(urls)} URLs")
+
+# Group URLs so each rate-limited host forms one serial batch; every other
+# URL is its own batch and can be checked fully in parallel.
+parallel = {}
+for url in urls:
+    origin = urlparse(url).netloc
+    if origin in RATELIMITED:
+        parallel.setdefault(origin, []).append(url)
+    else:
+        parallel[url] = [url]
+
 with concurrent.futures.ThreadPoolExecutor(max_workers=PARALLEL) as executor:
     broken = []
-    for url, status in executor.map(url_check, urls):
+    results = itertools.chain.from_iterable(executor.map(urls_check, parallel.values()))
+    for url, status in results:
         if not status:
             broken.append(url)