From 0936297f985b108c8caef08882c07d5959047263 Mon Sep 17 00:00:00 2001
From: Martyn Welch <martyn.welch@collabora.com>
Date: Mon, 21 Jun 2021 18:06:17 +0100
Subject: [PATCH] Test links used in the Apertis website

The Apertis website contains many links, some of which will inevitably be dead.
This script parses the Markdown documents to find external links, collects them
into a unique list and tests each one.

Signed-off-by: Martyn Welch <martyn.welch@collabora.com>
---
 scripts/test_urls.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100755 scripts/test_urls.py

diff --git a/scripts/test_urls.py b/scripts/test_urls.py
new file mode 100755
index 000000000..62c3820b2
--- /dev/null
+++ b/scripts/test_urls.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+
+# Find external links and test to see if they work.
+
+import markdown
+import os
+import random
+import re
+import requests
+import sys
+from urllib.parse import urlparse
+import urllib3
+
+
+def get_link(match):
+    link = match.group("link")
+
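+    # Only test absolute http(s) links; relative links and other schemes are skipped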
+    if not link.startswith("http"):
+        return
+
+    # Strip fragments so we only test URLs once
+    url = urlparse(link)
+    url = url._replace(fragment="")
+    link = url.geturl()
+
+    if link not in urls:
+        urls.append(link)
+
+
+def parse_file(filename):
+    with open(filename, "r") as file:
+        contents = file.read()
+
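+        # Skip files that don't start with "+++"-delimited front matter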
+        if not contents.startswith("+++"):
+            return
+
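+        # Everything after the second "+++" delimiter is the page body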
+        doc = contents.split("+++", 2)[2]
+
+        # Convert to HTML to simplify link detection
+        text = markdown.markdown(doc)
+
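+        # Collect every href in the rendered HTML; get_link() filters and deduplicates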
+        pattern = re.compile(r'href="(?P<link>.*?)"')
+        for match in pattern.finditer(text):
+            get_link(match)
+
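+# Unique list of external URLs collected from all pages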
+urls = []
+
+# Walk the content tree and collect links from every Markdown file
+for root, dirs, files in os.walk(sys.argv[1]):
+    for file in files:
+        if ".md" in file:
+            try:
+                parse_file(os.path.join(root, file))
+            except Exception:
+                print("Failed to parse %s/%s" % (root, file))
+
+# Testing URLs in sorted order sends many consecutive requests to the same host,
+# which seems to trip anti-crawling measures, so randomise the order instead.
+#urls.sort()
+random.shuffle(urls)
+
+# Get rid of warning from not verifying certs
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Some sites don't like that we're not a browser - so pretend to be Chrome...
+headers = {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
+}
+
+broken = 0
+for url in urls:
+    print("%s : " %(url), end='')
+    sys.stdout.flush()
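+    # Try a cheap HEAD request first to avoid downloading the full page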
+    try:
+        resp = requests.head(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
+        status = resp.ok
+        resp.close()
+    except Exception:
+        status = False
+
+    try:
+        # Some servers aren't set up to handle HEAD requests, so retry anything
+        # that didn't return a success status with a full GET as well.
+        if not status:
+            resp = requests.get(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
+            status = resp.ok
+            resp.close()
+    except Exception:
+        status = False
+
+    if status:
+        print("OK")
+    else:
+        print("Fail")
+        broken += 1
+
+print("Found %d broken URLs in %d tested" %(broken, len(urls)))
+
+if broken:
+    sys.exit(1)
+
+sys.exit(0)
-- 
GitLab