Commit 9fc43368 authored by Emanuele Aina

Adopt exponential backoff when retrying

The current naive approach to retrying is not exactly gentle: if the
server is overloaded, we retry right away, making the problem even worse.

See https://encore.dev/blog/retries for a fun explanation of how the
naive approach can be catastrophic.
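To make the amplification concrete: if every layer in a stack of services naively retries `r` times on failure, the bottom layer can see up to `(r + 1) ** n` times the original load across `n` layers. This is an illustrative back-of-the-envelope sketch, not a figure from the linked post:

```python
def amplification(layers: int, retries: int) -> int:
    # Worst case: every request at each layer fans out into
    # (retries + 1) attempts at the layer below it.
    return (retries + 1) ** layers

print(amplification(1, 3))  # one layer, 3 retries: 4x load
print(amplification(3, 3))  # three stacked layers, 3 retries: 64x load
```

With backoff and a bounded stop condition, the fan-out is spread over time instead of hammering an already-struggling server.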

Fortunately the `tenacity` library allows us to add a bounded exponential
backoff while also making the code easier to understand.

This should make the dashboard behave much better toward GitLab and OBS
when they hit an issue.

Signed-off-by: Emanuele Aina <emanuele.aina@collabora.com>
parent 77d6f670
1 merge request: !180 Adopt exponential backoff when retrying
Pipeline #628033 passed
@@ -136,6 +137,7 @@ packaging-data-fetch-downstream:
ca-certificates
python3-debian
python3-gitlab
python3-tenacity
python3-yaml
wget
script:
@@ -288,6 +289,7 @@ packaging-data-fetch-obs:
python3-debian
python3-gitlab
python3-m2crypto
python3-tenacity
python3-yaml
osc
script:
import collections.abc
import concurrent.futures
import logging
import sys
import traceback
import tenacity
def _retrying_on_exception(exception_type=Exception):
retrying = tenacity.Retrying(
stop=tenacity.stop_after_delay(600),
retry=tenacity.retry_if_exception_type(exception_type),
wait=tenacity.wait_exponential_jitter(max=60),
)
return retrying
def item_id(item):
@@ -24,24 +35,19 @@ def item_id(item):
def thread_pool(num_workers, func, items, num_retries=0):
    def func_with_retries(item, counter, total):
-        for i in range(1, num_retries + 2):
-            itemid = item_id(item)
+        itemid = item_id(item)
+        for attempt in _retrying_on_exception():
+            i = attempt.retry_state.attempt_number
            logging.debug(f"Fetching item {counter} of {total} ({itemid}), round #{i}")
-            try:
+            with attempt:
                func(item)
-            except Exception:
-                if i == num_retries + 1:  # this was the last retry
-                    logging.error(
-                        f"Failed processing item {counter} of {total} ({itemid}), round #{i}"
-                    )
-                    raise
+            if attempt.retry_state.outcome.failed:
                logging.error(
-                    f"Retry on failure fetching item {counter} of {total} ({itemid}), round #{i}"
+                    f"Failed processing item {counter} of {total} ({itemid}), round #{i}"
                )
-                if logging.DEBUG >= logging.root.level:
-                    sys.excepthook(*sys.exc_info())
-            else:
-                break
+                if logging.DEBUG >= logging.root.level and i != num_retries:
+                    ex = attempt.retry_state.outcome.exception()
+                    traceback.print_exception(type(ex), ex, ex.__traceback__)
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
items = list(items)
......