From bddc195e1752dde921980c2c75cee0ca1a448e5e Mon Sep 17 00:00:00 2001
From: Emanuele Aina <emanuele.aina@collabora.com>
Date: Fri, 9 Jul 2021 10:33:04 +0200
Subject: [PATCH] test_urls: Ratelimit some origins

The GNOME wiki is unhappy about the rate of our requests, so be nice
and throttle the requests we send to it.

Signed-off-by: Emanuele Aina <emanuele.aina@collabora.com>
---
 scripts/test_urls.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/scripts/test_urls.py b/scripts/test_urls.py
index ca5f17ac4..41a335f16 100755
--- a/scripts/test_urls.py
+++ b/scripts/test_urls.py
@@ -16,6 +16,9 @@ import concurrent.futures
 import time
 import fnmatch
 import textwrap
+import itertools
+
+PARALLEL = 50
 
 EXCLUDE = [
     "*://lavaphabbridge.apertis.org",   #
@@ -23,6 +26,13 @@ EXCLUDE = [
     "*://phabricator.apertis.org/T*",   # it's not public anyway :(
 ]
 
+RATELIMITED = [
+    "live.gnome.org",
+    "wiki.gnome.org",
+]
+
+RATELIMIT_DELAY_SECONDS = 2
+
 urls = set()
 
 def get_link(url):
@@ -105,6 +115,7 @@ def url_check(url):
         # Some servers aren't setup to handle HEAD requests, so check anything
         # that's not got a 200 status code with GET as well.
         if not status:
+            time.sleep(RATELIMIT_DELAY_SECONDS)
             resp = None
             resp = session.get(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
             status = resp.ok
@@ -118,10 +129,27 @@ def url_check(url):
     print(url, "OK" if status else "FAIL", resp.status_code if resp else "-", f"{end - start:0.4f}s")
     return url, status
 
+def urls_check(urls):
+    results = []
+    for url in urls:
+        results.append(url_check(url))
+        time.sleep(RATELIMIT_DELAY_SECONDS)
+    return results
+
 print(f"Testing {len(urls)} URLs")
+
+parallel = {}
+for url in urls:
+    origin = urlparse(url).netloc
+    if origin in RATELIMITED:
+        parallel.setdefault(origin, []).append(url)
+    else:
+        parallel[url] = [url]
+
 with concurrent.futures.ThreadPoolExecutor(max_workers=PARALLEL) as executor:
     broken = []
-    for url, status in executor.map(url_check, urls):
+    results = itertools.chain.from_iterable(executor.map(urls_check, parallel.values()))
+    for url, status in results:
         if not status:
             broken.append(url)
 
-- 
GitLab