Commit 0936297f authored by Martyn Welch

Test links use in Apertis website


The Apertis website contains many links, some of which will be dead. This
script parses the site's Markdown documents to find links, which are
collected into a unique list and then tested.

Signed-off-by: Martyn Welch <martyn.welch@collabora.com>
parent b3add43e
Merge request !267: Test and fix links use in Apertis website
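The URL normalisation mentioned in the description is done with urlparse() in get_link() below. A minimal sketch of just that step, using a made-up URL:

from urllib.parse import urlparse

# Hypothetical link found in a Markdown page; the fragment is stripped so the
# same page is only queued for testing once.
link = "https://www.example.org/docs/page/#section"
link = urlparse(link)._replace(fragment="").geturl()
print(link)  # https://www.example.org/docs/page/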
#!/usr/bin/env python3
# Find external links and test to see if they work.
import markdown
import os
import random
import re
import requests
import sys
import traceback
from urllib.parse import urlparse
import urllib3

def get_link(url):
    # Called by re.sub() for every href match; "url" is the match object
    link = url.group("link")

    # Only external (http/https) links are of interest
    if not link[:4] == "http":
        return

    # Strip fragments so we only test URLs once
    url = urlparse(link)
    url = url._replace(fragment="")
    link = url.geturl()

    if link not in urls:
        urls.append(link)


def parse_file(filename):
    # print("%s: " % filename)
    with open(filename, "r+") as file:
        contents = file.read()

    # Only process pages that start with TOML front matter
    if not contents[0:3] == "+++":
        return

    fm = contents.split("+++")[1]
    doc = contents.split("+++", 2)[2]

    # Convert to HTML to simplify link detection
    text = markdown.markdown(doc)

    pattern = re.compile("href=\"(?P<link>.*?)\"")
    doc = pattern.sub(get_link, text)
urls = []

# Walk the content tree and parse every Markdown page
for root, dirs, files in os.walk(sys.argv[1]):
    for file in files:
        if ".md" in file:
            try:
                parse_file("%s/%s" % (root, file))
            except Exception:
                print("Failed to parse %s/%s" % (root, file))

# Testing the URLs in sorted order seems to trigger issues with sites not
# wanting us to crawl them
#urls.sort()
# So let's randomise the list to see if that helps...
random.shuffle(urls)

# Get rid of the warning from not verifying certs
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Some sites don't like that we're not a browser - so pretend to be Chrome...
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
}
broken = 0

for url in urls:
    print("%s : " % url, end="")
    sys.stdout.flush()

    # Assume the URL is broken until a request succeeds; without this the
    # result from the previous URL would leak through when HEAD raises.
    status = False

    try:
        resp = requests.head(url, headers=headers, allow_redirects=True,
                             timeout=60, verify=False)
        status = resp.ok
        resp.close()
    except Exception:
        pass

    try:
        # Some servers aren't set up to handle HEAD requests, so retry
        # anything that didn't respond successfully with GET as well.
        if not status:
            resp = requests.get(url, headers=headers, allow_redirects=True,
                                timeout=60, verify=False)
            status = resp.ok
            resp.close()
    except Exception:
        status = False

    if status:
        print("OK")
    else:
        print("Fail")
        broken += 1

print("Found %d broken URLs in %d tested" % (broken, len(urls)))

if broken:
    sys.exit(1)

sys.exit(0)
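A minimal usage sketch, assuming the script is saved as check-links.py (the commit does not name the file) and is pointed at the directory holding the site's Markdown content:

./check-links.py content
https://www.example.org/dead-page : Fail
https://www.apertis.org/ : OK
Found 1 broken URLs in 2 tested

The two URLs are invented for illustration; the per-URL output, the summary line and the non-zero exit status on failure all come from the script above, so it can be used directly as a CI gate.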