diff --git a/scripts/test_urls.py b/scripts/test_urls.py
new file mode 100755
index 0000000000000000000000000000000000000000..62c3820b2c2e00e4a7322d5e524fc8cb6d214429
--- /dev/null
+++ b/scripts/test_urls.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+# Find external links in the site's Markdown content and test whether they
+# still work.
+
+import markdown
+import os
+import random
+import re
+import requests
+import sys
+from urllib.parse import urlparse
+import urllib3
+
+
+def get_link(match):
+    link = match.group("link")
+
+    # Only test external links
+    if not link.startswith("http"):
+        return
+
+    # Strip fragments so we only test each URL once
+    url = urlparse(link)
+    url = url._replace(fragment="")
+    link = url.geturl()
+
+    if link not in urls:
+        urls.append(link)
+
+
+def parse_file(filename):
+    with open(filename, "r") as file:
+        contents = file.read()
+
+    # Skip files without TOML front matter
+    if not contents.startswith("+++"):
+        return
+
+    doc = contents.split("+++", 2)[2]
+
+    # Convert to HTML to simplify link detection
+    text = markdown.markdown(doc)
+
+    # Collect every href via the side effect in get_link()
+    pattern = re.compile(r'href="(?P<link>.*?)"')
+    for match in pattern.finditer(text):
+        get_link(match)
+
+
+urls = []
+
+# Walk the content tree and collect links from every Markdown file
+for root, dirs, files in os.walk(sys.argv[1]):
+    for file in files:
+        if file.endswith(".md"):
+            try:
+                parse_file(os.path.join(root, file))
+            except Exception:
+                print("Failed to parse %s" % os.path.join(root, file))
+
+# Testing the URLs in sorted order seems to trigger issues with sites not
+# wanting us to crawl them:
+#urls.sort()
+# So randomise the list instead to see if that helps...
+random.shuffle(urls)
+
+# Get rid of the warning from not verifying certificates
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+
+# Some sites don't like that we're not a browser - so pretend to be Chrome...
+headers = {
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
+}
+
+broken = 0
+for url in urls:
+    print("%s : " % url, end="")
+    sys.stdout.flush()
+
+    status = False
+    try:
+        resp = requests.head(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
+        status = resp.ok
+        resp.close()
+    except Exception:
+        pass
+
+    try:
+        # Some servers aren't set up to handle HEAD requests, so retry
+        # anything that didn't return a successful status with GET as well.
+        if not status:
+            resp = requests.get(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
+            status = resp.ok
+            resp.close()
+    except Exception:
+        status = False
+
+    if status:
+        print("OK")
+    else:
+        print("Fail")
+        broken += 1
+
+print("Found %d broken URLs in %d tested" % (broken, len(urls)))
+
+if broken:
+    sys.exit(1)
+
+sys.exit(0)
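
A minimal way to exercise the script, assuming the Markdown sources live under a content/ directory (adjust the path to wherever this repo keeps its pages) and that the markdown and requests packages are installed:

    pip install markdown requests
    ./scripts/test_urls.py content

Because it exits non-zero when any link fails, it can be run directly as a CI check.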