#!/usr/bin/env python3

# Find external links in the markdown content and test whether they work.
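#
# Usage: test_urls.py <directory containing markdown files>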

import markdown
import os
import random
import re
import requests
import sys
import traceback
from urllib.parse import urlparse
import urllib3
import time
import fnmatch
import textwrap
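
# URLs matching any of these glob patterns (checked with fnmatch) are
# skipped entirely.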

EXCLUDE = [
    "*://lavaphabbridge.apertis.org",   #
    "*://lavaphabbridge.apertis.org/*", # it's slooooooow
    "*://phabricator.apertis.org/T*",   # it's not public anyway :(
]
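
# Every unique external URL found in the content tree; fragments are
# stripped before adding so each page is only tested once.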

urls = set()

def get_link(match):
    """re.sub() callback that records absolute http(s) links for testing."""
    link = match.group("link")

    # Only absolute http/https links are external
    if not link.startswith("http"):
        return

    # Strip fragments so we only test each URL once
    url = urlparse(link)
    url = url._replace(fragment="")
    link = url.geturl()

    for exclude in EXCLUDE:
        if fnmatch.fnmatch(link, exclude):
            return

    urls.add(link)


def parse_file(filename):
    """Collect external links from a markdown file with +++ front matter."""
    # print("%s: " % filename)
    with open(filename, "r") as file:
        contents = file.read()

        if not contents.startswith("+++"):
            return

        # Split the +++ front matter (fm, unused) from the markdown body (doc)
        fm = contents.split("+++")[1]
        doc = contents.split("+++", 2)[2]

        # Convert to HTML to simplify link detection
        text = markdown.markdown(doc)

        # sub() is only used to iterate over every href; get_link() records
        # each matching URL and the substituted text is discarded.
        pattern = re.compile(r'href="(?P<link>.*?)"')
        pattern.sub(get_link, text)

# Walk the content tree and collect links from every markdown file
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        if file.endswith(".md"):
            try:
                parse_file("%s/%s" % (root, file))
            except Exception:
                print("Failed to parse %s/%s" % (root, file))
                traceback.print_exc()

urls = list(urls)
# This seems to trigger issues with sites not wanting us to crawl them
#urls.sort()
# So let's randomise the list to see if that helps...
random.shuffle(urls)

# Get rid of the warnings caused by not verifying certificates
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Some sites don't like that we're not a browser, so pretend to be Chrome...
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
}
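
# Test every URL: try a cheap HEAD request first and fall back to GET when
# that fails, timing each request and recording any that are broken.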

broken = []
for url in urls:
    print("%s : " % url, end="")
    sys.stdout.flush()
    start = time.perf_counter()
    status = None
    resp = None
    try:
        resp = requests.head(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
        status = resp.ok
        resp.close()
    except Exception as e:
        e_str = textwrap.indent(str(type(e)) + "\n" + str(e), "  ")
        print(f"ERROR(1): {url} {resp.status_code if resp else '-'}\n{e_str}")

    try:
        # Some servers aren't set up to handle HEAD requests, so retry
        # anything that didn't return a successful status with a GET as well.
        if not status:
            resp = None
            resp = requests.get(url, headers=headers, allow_redirects=True, timeout=60, verify=False)
            status = resp.ok
            resp.close()
    except Exception as e:
        status = False
        e_str = textwrap.indent(str(type(e)) + "\n" + str(e), "  ")
        print(f"ERROR(2): {url} {resp.status_code if resp else '-'}\n{e_str}")
    end = time.perf_counter()

    if not status:
        broken.append(url)
    print(url, "OK" if status else "FAIL", f"{end - start:0.4f}s")
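
# Summarise the results; the exit status reflects whether any URL failed.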

print(f"Found {len(broken)} broken URLs in {len(urls)} tested:")
for b in broken:
    print(" ", b)

sys.exit(1 if broken else 0)