diff --git a/.gitignore b/.gitignore
index 894a44c..315ba54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -102,3 +102,6 @@ venv.bak/
# mypy
.mypy_cache/
+
+# Test Site Output
+_site*
diff --git a/check-links-3.py b/check-links-3.py
index 195691f..d6840a1 100644
--- a/check-links-3.py
+++ b/check-links-3.py
@@ -23,402 +23,437 @@
# of filename/link pairs.
-CHROME = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
- 'AppleWebKit/537.36 (KHTML, like Gecko) '
- 'Chrome/41.0.2228.0 Safari/537.36'
-}
-
-
-def drop_dot(foo):
- if foo != "" and foo[0] == '.':
- return foo[1:]
- return foo
-
-
-def get_all_html_files(path):
- result = []
- for root, dirs, files in os.walk(path):
- for name in files:
- if name.endswith((".html", ".htm")):
- f = os.path.join(root, name)
- if f not in result:
- if verbose >= 3:
- print("File scan: adding '%s'" % f)
- result.append(f)
- for d in dirs:
- files_in_d = get_all_html_files(join(root, d))
- if files_in_d:
- for f in files_in_d:
+class JekyllLinkChecker:
+ def __init__(self):
+ self.CHROME = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) '
+ 'AppleWebKit/537.36 (KHTML, like Gecko) '
+ 'Chrome/41.0.2228.0 Safari/537.36'
+ }
+ self.file_link_pairs = []
+ self.unique_links = []
+ self.failed_links = []
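+        # Map of element tag -> id attribute; matching elements are
+        # excluded from link checking (see remove_skip_elements).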
+ self.skip_elements = {
+ "a": "edit_on_github"
+ }
+ self.status_count = 0
+ self.html_cache_results = {}
+ self.dns_skip = []
+ self.verbose = 0
+ self.output_file = None
+ self.args = self.parse_args()
+ self.main()
+
+ def main(self):
+ """
+ Main method - runs on instantiation
+ """
+ print("Linaro Link Checker")
+ if self.args.verbose is not None:
+ self.verbose = self.args.verbose
+ print("Verbosity is at level %s" % self.verbose)
+ if self.args.skip_dns_check is not None:
+ print("Loading FQDN skip list from %s" % self.args.skip_dns_check)
+            try:
+                # Strip newlines so entries compare equal to URL netlocs
+                with open(self.args.skip_dns_check) as skip_file:
+                    self.dns_skip = [line.strip() for line in skip_file]
+            except Exception as exception:
+                print("Couldn't load FQDN skip list:", exception)
+ if self.args.output is not None:
+ self.output_file = self.args.output
+ if self.args.directory is not None:
+ print("Scanning '%s'" % self.args.directory)
+ os.chdir(self.args.directory)
+ if self.args.nointernal:
+ print("Skipping internal link checking")
+ if self.args.noexternal:
+ print("Skipping external link checking")
+ # For now, assume that we're just scanning the current directory. Add code
+ # for file paths and possibly URLs at a future date ...
+ self.scan_directory("./", self.args.skip_path)
+
+ def parse_args(self):
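+        """
+        Build the command-line parser and return the parsed arguments.
+        """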
+ parser = argparse.ArgumentParser(
+ description="Scan for broken links")
+ parser.add_argument('-d', '--directory', nargs='?', default=None,
+ help='specifies the directory to scan')
+ parser.add_argument('--skip-dns-check', nargs='?', default=None,
+ help='specifies text file of FQDNs to skip the DNS '
+ 'check on')
+        parser.add_argument('-s', '--skip-path', action='append',
+                            default=None,
+                            help='specifies a path to skip when '
+                            'checking URLs')
+ parser.add_argument('-v', '--verbose', action='count')
+ parser.add_argument('-f', '--file', action='append',
+ help=('specifies a file to check;'
+ ' all non-specified files are ignored'))
+ parser.add_argument('--nointernal', action='store_true',
+ help='skips checking of internal references')
+ parser.add_argument('--noexternal', action='store_true',
+ help='skips checking of external references')
+ parser.add_argument('-o', '--output', nargs='?', default=None,
+ help='specifies output file for error results')
+ parser.add_argument('--no-external-errors', action='store_true',
+ help='ignores errors caused by external broken links')
+ return parser.parse_args()
+
+ def drop_dot(self, foo):
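+        """
+        Strip a leading '.' from a path (used to tidy up reported paths).
+        """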
+ if foo != "" and foo[0] == '.':
+ return foo[1:]
+ return foo
+
+ def get_all_html_files(self, path):
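+        """
+        Recursively collect every .html/.htm file under 'path'.
+        """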
+ result = []
+ for root, dirs, files in os.walk(path):
+ for name in files:
+ if name.endswith((".html", ".htm")):
+ f = os.path.join(root, name)
if f not in result:
- if verbose >= 3:
+ if self.verbose >= 3:
print("File scan: adding '%s'" % f)
result.append(f)
- return result
-
-
-def validate_file_link(filename, text):
- # If there is an anchor (#) in the text, we need to look at what
- # comes before it.
- text = text.split("#")[0]
- # If there is a query (?) in the text, we need to look at what
- # comes before it.
- text = text.split("?")[0]
- # If "text" starts with "/" then we need to be looking at the
- # path relative to where we started scanning.
- #
- # Otherwise, it will be relative to where the current file is
- # located.
- if text[0] == "/":
- head = "."
- else:
- # Text will be pointing at a directory or file, relative to
- # where the parent file is living.
- # head gets us the directory where the parent file lives.
- head, tail = os.path.split(filename)
- if head[-1] != '/' and text[0] != '/':
- combined_path = "%s/%s" % (head, text)
- else:
- combined_path = "%s%s" % (head, text)
- # If the path contains a double-slash, that works on the OS but not in the
- # browser so we need to explicitly check for it.
- if "//" in combined_path:
- return combined_path
- # If we're looking at a directory, make sure there is an index file in it.
- if combined_path[-1] == '/':
- combined_path += "index.html"
- if verbose >= 2:
- print(("Validating file: constituent parts are '%s' and '%s',"
- " combined path is '%s'") % (head, text, combined_path))
- # needs to be a file or directory ...
- result = os.path.exists(combined_path)
- if result:
- return None
- else:
- return combined_path
-
-
-def matched_skip(text, skip_list):
- if skip_list is not None:
- for s in skip_list:
- if text.startswith(s):
- return True
- return False
-
-
-def validate_link(filename, text):
- global file_link_pairs
- global unique_links
- # Clean up the text first ...
- if text is not None:
- text = text.strip()
- if text is None or text == "" or text[0] == "#":
- # or matched_redirect(text):
- return None
- else:
- # Some links don't have the transport on them to ensure that they work
- # whether the user is coming via http or https, so add it if it is
- # missing.
- if len(text) > 2 and text[:2] == "//":
- text = "https:" + text
- # Check the URL to see if it is a web link - that is all we check.
- o = urlparse(text)
- if not args.noexternal and (o.scheme == "http" or o.scheme == "https"):
- # We use "file_link_pairs" to track which files reference which
- # URLs - we only check URLs *once* but then flag up all
- # refernces to the link.
- if [filename, text] not in file_link_pairs:
- file_link_pairs.append([filename, text])
- # ... only check the links once!
- if text not in unique_links:
- unique_links.append(text)
- return None # Postpone the decision for now ...
- elif not args.nointernal and o.scheme == "":
- return validate_file_link(filename, text)
- # If skipping stuff, return the answer of no problems ...
- return None
-
-
-def output_status(code, value):
- global status_count
-
- if status_count % 100 == 0:
- end = "\n"
- else:
- end = ""
- print(code, end=end, flush=True)
- status_count += 1
- return value
-
-
-async def async_check_link(session, url):
- # Check that the host resolves, but only if it isn't in the DNS skip list
- parts = urlparse(url)
- if parts.netloc not in dns_skip:
+ for d in dirs:
+ files_in_d = self.get_all_html_files(join(root, d))
+ if files_in_d:
+ for f in files_in_d:
+ if f not in result:
+ if self.verbose >= 3:
+ print("File scan: adding '%s'" % f)
+ result.append(f)
+ return result
+
+ def validate_file_link(self, filename, text):
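+        """
+        Check an internal (file) link found in 'filename'. Returns None
+        if the target exists, otherwise the path that failed (including
+        any path containing '//', which works on the OS but not in the
+        browser).
+        """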
+ # If there is an anchor (#) in the text, we need to look at what
+ # comes before it.
+ text = text.split("#")[0]
+ # If there is a query (?) in the text, we need to look at what
+ # comes before it.
+ text = text.split("?")[0]
+ # If "text" starts with "/" then we need to be looking at the
+ # path relative to where we started scanning.
+ #
+ # Otherwise, it will be relative to where the current file is
+ # located.
+ if text[0] == "/":
+ head = "."
+ else:
+ # Text will be pointing at a directory or file, relative to
+ # where the parent file is living.
+ # head gets us the directory where the parent file lives.
+ head, tail = os.path.split(filename)
+ if head[-1] != '/' and text[0] != '/':
+ combined_path = "%s/%s" % (head, text)
+ else:
+ combined_path = "%s%s" % (head, text)
+ # If the path contains a double-slash, that works on the OS but not in the
+ # browser so we need to explicitly check for it.
+ if "//" in combined_path:
+ return combined_path
+ # If we're looking at a directory, make sure there is an index file in it.
+ if combined_path[-1] == '/':
+ combined_path += "index.html"
+ if self.verbose >= 2:
+ print(("Validating file: constituent parts are '%s' and '%s',"
+ " combined path is '%s'") % (head, text, combined_path))
+ # needs to be a file or directory ...
+ result = os.path.exists(combined_path)
+ if result:
+ return None
+ else:
+ return combined_path
+
+ def matched_skip(self, text, skip_list):
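+        """
+        Return True if 'text' starts with any of the entries in skip_list.
+        """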
+ if skip_list is not None:
+ for s in skip_list:
+ if text.startswith(s):
+ return True
+ return False
+
+ def validate_link(self, filename, text):
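+        """
+        Validate a single link from 'filename'. External http(s) links
+        are queued for the later async check and None is returned for
+        now; internal links are checked immediately via
+        validate_file_link. Returns None if no problem was found,
+        otherwise the broken path.
+        """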
+ # Clean up the text first ...
+ if text is not None:
+ text = text.strip()
+ if text is None or text == "" or text[0] == "#":
+ # or matched_redirect(text):
+ return None
+ else:
+ # Some links don't have the transport on them to ensure that they work
+ # whether the user is coming via http or https, so add it if it is
+ # missing.
+ if len(text) > 2 and text[:2] == "//":
+ text = "https:" + text
+ # Check the URL to see if it is a web link - that is all we check.
+ o = urlparse(text)
+            if (not self.args.noexternal and
+                    (o.scheme == "http" or o.scheme == "https")):
+ # We use "self.file_link_pairs" to track which files reference which
+ # URLs - we only check URLs *once* but then flag up all
+                # references to the link.
+ if [filename, text] not in self.file_link_pairs:
+ self.file_link_pairs.append([filename, text])
+ # ... only check the links once!
+ if text not in self.unique_links:
+ self.unique_links.append(text)
+ return None # Postpone the decision for now ...
+ elif not self.args.nointernal and o.scheme == "":
+ return self.validate_file_link(filename, text)
+ # If skipping stuff, return the answer of no problems ...
+ return None
+
+ def output_status(self, code, value):
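+        """
+        Print a single progress character, starting a new line every
+        100 codes, and return 'value' unchanged.
+        """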
+ if self.status_count % 100 == 0:
+ end = "\n"
+ else:
+ end = ""
+ print(code, end=end, flush=True)
+ self.status_count += 1
+ return value
+
+ async def async_check_link(self, session, url):
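+        """
+        Check a single external URL. Progress characters:
+          D      DNS lookup failed
+          .      status outside the 4xx range (treated as OK)
+          X      404/405 confirmed with a follow-up GET
+          _      other 4xx status (ignored; see comment below)
+          a - h  connection/timeout errors (treated as non-fatal)
+        """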
+ # Check that the host resolves, but only if it isn't in the DNS skip list
+ parts = urlparse(url)
+ if parts.netloc not in self.dns_skip:
+ try:
+                socket.gethostbyname(parts.netloc)
+            except socket.gaierror:
+ return self.output_status('D', 1)
+ # Now try to validate the URL
try:
- foo = socket.gethostbyname(parts.netloc) # noqa
- except socket.gaierror as err:
- return output_status('D', 1)
- # Now try to validate the URL
- try:
- async with session.head(
- url,
- allow_redirects=True,
- headers=CHROME) as response:
- if response.status == 404 or response.status == 405:
- # Some sites return 404/405 for HEAD requests, so we need to
- # double-check with a full request.
- async with session.get(
- url,
- allow_redirects=True,
- headers=CHROME) as response:
- if response.status != 404 and response.status != 405:
- return output_status('.', 0)
- return output_status('X', response.status)
- else:
- if (response.status < 400 or
- response.status > 499):
- return output_status('.', 0)
+ async with session.head(
+ url,
+ allow_redirects=True,
+ headers=self.CHROME) as response:
+ if response.status == 404 or response.status == 405:
+ # Some sites return 404/405 for HEAD requests, so we need to
+ # double-check with a full request.
+ async with session.get(
+ url,
+ allow_redirects=True,
+ headers=self.CHROME) as response:
+ if response.status != 404 and response.status != 405:
+ return self.output_status('.', 0)
+ return self.output_status('X', response.status)
else:
- if verbose >= 3:
- print(response.status, response.url)
- # We only really care about full-on failures, i.e. 404.
- # Other status codes can be returned just because we aren't
- # using a browser, even if we do provide the agent string
- # for Chrome.
- return output_status('_', 0)
- # (Non-)Fatal errors
- except socket.gaierror as err:
- print("Error while checking %s: %s" % (url, err))
- return output_status('a', -2)
- # Non-fatal errors, but indicate which error we are getting
- except aiohttp.client_exceptions.ClientConnectorError:
- return output_status('b', -3)
- except aiohttp.client_exceptions.ServerTimeoutError:
- return output_status('c', -4)
- except concurrent.futures._base.CancelledError:
- return output_status('d', -5)
- except concurrent.futures._base.TimeoutError:
- return output_status('e', -6)
- except aiohttp.client_exceptions.ClientOSError:
- return output_status('f', -7)
- except aiohttp.client_exceptions.ServerDisconnectedError:
- return output_status('g', -8)
- except aiohttp.client_exceptions.ClientResponseError:
- return output_status('h', -9)
-
-
-async def async_check_web(session, links):
- results = await asyncio.gather(
- *[async_check_link(session, url) for url in links]
- )
- # That gets us a collection of the responses, matching up to each of
- # the tasks, so loop through the links again and the index counter
- # will point to the corresponding result.
- i = 0
- for l in links:
- if l not in html_cache_results:
- if results[i] == 0:
- html_cache_results[l] = None
- elif results[i] > 0:
- html_cache_results[l] = "%s [%d]" % (l, results[i])
- i += 1
-
-
-# Perform an async check of all of the web links we've collected then
-# build up a list of the affected files for the faulty links.
-async def check_unique_links():
- global status_count
- status_count = 1
-
- web_failed_links = []
- print("Checking %s web links ..." % len(unique_links))
- # Force IPv4 only to avoid
- # https://stackoverflow.com/questions/40347726/python-3-5-asyincio-and-aiohttp-errno-101-network-is-unreachable
- conn = aiohttp.TCPConnector(
- family=socket.AF_INET,
- verify_ssl=False,
- limit=500
- )
- async with aiohttp.ClientSession(connector=conn,
- conn_timeout=60) as session:
- await async_check_web(session, unique_links)
- for p in file_link_pairs:
- # p[0] is the file path and p[1] is the URL.
- if (p[1] in html_cache_results and
- html_cache_results[p[1]] is not None):
- error = [p[0], html_cache_results[p[1]]]
- if error not in web_failed_links:
- web_failed_links.append(error)
- return web_failed_links
-
-
-# For the specified file, read it in and then check all of the links in it.
-def check_file(filename, skip_list):
- file_failed_links = []
- if not matched_skip(filename, skip_list):
- try:
- with open(filename, "r") as myfile:
- data = myfile.read()
- soup = BeautifulSoup(data, 'html.parser')
- a_links = soup.find_all('a')
- # Linaro specific ... find any "edit on GitHub" links so that
- # they can be EXCLUDED from the list of links to check. The reason
- # why is because if this is a new page (i.e. in a Pull Request),
- # the file won't exist in the repository yet and so the link to
- # the page would fail.
- gh_links = soup.find_all('a', id="edit_on_github")
+ if (response.status < 400 or
+ response.status > 499):
+ return self.output_status('.', 0)
+ else:
+ if self.verbose >= 3:
+ print(response.status, response.url)
+ # We only really care about full-on failures, i.e. 404.
+ # Other status codes can be returned just because we aren't
+ # using a browser, even if we do provide the agent string
+ # for Chrome.
+ return self.output_status('_', 0)
+ # (Non-)Fatal errors
+ except socket.gaierror as err:
+ print("Error while checking %s: %s" % (url, err))
+ return self.output_status('a', -2)
+ # Non-fatal errors, but indicate which error we are getting
+ except aiohttp.client_exceptions.ClientConnectorError:
+ return self.output_status('b', -3)
+ except aiohttp.client_exceptions.ServerTimeoutError:
+ return self.output_status('c', -4)
+ except concurrent.futures._base.CancelledError:
+ return self.output_status('d', -5)
+ except concurrent.futures._base.TimeoutError:
+ return self.output_status('e', -6)
+ except aiohttp.client_exceptions.ClientOSError:
+ return self.output_status('f', -7)
+ except aiohttp.client_exceptions.ServerDisconnectedError:
+ return self.output_status('g', -8)
+ except aiohttp.client_exceptions.ClientResponseError:
+ return self.output_status('h', -9)
+
+ async def async_check_web(self, session, links):
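+        """
+        Check all of the links concurrently and cache the outcome for
+        each URL in html_cache_results: None for a good link, or
+        "<url> [status]" for an HTTP failure. Network errors are not
+        cached.
+        """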
+ results = await asyncio.gather(
+ *[self.async_check_link(session, url) for url in links]
+ )
+ # That gets us a collection of the responses, matching up to each of
+ # the tasks, so loop through the links again and the index counter
+ # will point to the corresponding result.
+        for i, link in enumerate(links):
+            if link not in self.html_cache_results:
+                if results[i] == 0:
+                    self.html_cache_results[link] = None
+                elif results[i] > 0:
+                    self.html_cache_results[link] = "%s [%d]" % (
+                        link, results[i])
+
+    async def check_unique_links(self):
+        """
+        Perform an async check of all of the web links we've collected,
+        then build up a list of the affected files for the faulty links.
+        """
+ self.status_count = 1
+
+ web_failed_links = []
+ print("Checking %s web links ..." % len(self.unique_links))
+ # Force IPv4 only to avoid
+ # https://stackoverflow.com/questions/40347726/python-3-5-asyincio-and-aiohttp-errno-101-network-is-unreachable
+ conn = aiohttp.TCPConnector(
+ family=socket.AF_INET,
+ verify_ssl=False,
+ limit=500
+ )
+ async with aiohttp.ClientSession(connector=conn,
+ conn_timeout=60) as session:
+ await self.async_check_web(session, self.unique_links)
+ for p in self.file_link_pairs:
+ # p[0] is the file path and p[1] is the URL.
+ if (p[1] in self.html_cache_results and
+ self.html_cache_results[p[1]] is not None):
+ error = [p[0], self.html_cache_results[p[1]]]
+ if error not in web_failed_links:
+ web_failed_links.append(error)
+ return web_failed_links
+
+ def remove_skip_elements(self, soup, a_links):
+ """
+ Removes any elements that have explicitly been set to skip
+ """
+ # Linaro specific ... find any "edit on GitHub" links so that
+ # they can be EXCLUDED from the list of links to check. The reason
+ # why is because if this is a new page (i.e. in a Pull Request),
+ # the file won't exist in the repository yet and so the link to
+ # the page would fail.
+ for skip_element_tag, skip_element_id in self.skip_elements.items():
+ gh_links = soup.find_all(skip_element_tag, id=skip_element_id)
for g in gh_links:
a_links.remove(g)
- for link in a_links:
- result = validate_link(filename, link.get('href'))
- if result is not None:
- error = [filename, result]
- if error not in file_failed_links:
- file_failed_links.append(error)
- # Check linked images
- img_links = soup.find_all('img')
- for link in img_links:
- result = validate_link(filename, link.get('src'))
- if result is not None:
- error = [filename, result]
- if error not in file_failed_links:
- file_failed_links.append(error)
- except Exception as exception:
- print("FAILED TO READ '%s' - %s" % (filename, str(exception)))
- return file_failed_links
-
-
-def failures_to_dict(list_of_failures):
- failure_dict = {}
- for f in list_of_failures:
- file = drop_dot(f[0])
- url = drop_dot(f[1])
- if file in failure_dict:
- failure_dict[file].append(url)
+ return a_links
+
+ def check_file(self, filename, skip_list):
+ """
+ For the specified file, read it in and then check all of the links in it.
+ """
+ file_failed_links = []
+ # Check file is not in skip list
+ if not self.matched_skip(filename, skip_list):
+ try:
+                # Retrieve the contents of the file
+                with open(filename, "r") as my_file:
+                    data = my_file.read()
+                # Set up a new BeautifulSoup parser
+                soup = BeautifulSoup(data, 'html.parser')
+                # Find all anchor (<a>) tags
+ a_links = soup.find_all('a')
+ # Remove any elements that have been set to be skipped.
+ a_links = self.remove_skip_elements(soup, a_links)
+ # Loop over all links and check if links are valid
+ for link in a_links:
+ # Validate link
+ result = self.validate_link(filename, link.get('href'))
+ # Check to see if an error was found
+ if result is not None:
+ # Create new list
+ error = [filename, result]
+ # Check the error hasn't already been found for this file
+ if error not in file_failed_links:
+                            # Append the error to THIS file's broken links.
+ file_failed_links.append(error)
+                # Check images that have a src="" attribute.
+                # Lazy-loaded images should also be checked;
+                # TODO: add data-src support.
+ images_list = soup.find_all('img')
+ for image in images_list:
+ # Validate link
+ result = self.validate_link(filename, image.get('src'))
+ # Check to see if result contains errors
+ if result is not None:
+ # Create new list
+ error = [filename, result]
+ if error not in file_failed_links:
+ file_failed_links.append(error)
+ except Exception as exception:
+ print("FAILED TO READ '%s' - %s" % (filename, str(exception)))
+ return file_failed_links
+
+ def failures_to_dict(self, list_of_failures):
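+        """
+        Group [file, url] failure pairs into a dict of file -> [urls].
+        """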
+ failure_dict = {}
+ for failure in list_of_failures:
+ failed_file = self.drop_dot(failure[0])
+ url = self.drop_dot(failure[1])
+ if failed_file in failure_dict:
+ failure_dict[failed_file].append(url)
+ else:
+ failure_dict[failed_file] = [url]
+ return failure_dict
+
+    def scan_directory(self, path, skip_list):
+        """
+        Scan the specified directory for HTML files and check them for
+        broken links, ignoring anything that matches skip_list.
+        """
+ soft_failure = False
+        # Get all of the HTML files under the path
+        html_files = self.get_all_html_files(path)
+        # Work out the total number of files we're checking
+ if self.args.file is not None:
+ total = len(self.args.file)
else:
- failure_dict[file] = [url]
- return failure_dict
-
-
-# Scan the specified directory, ignoring anything that matches skip_list.
-def scan_directory(path, skip_list):
- global failed_links
- global file_link_pairs
- global unique_links
- failed_links = []
- file_link_pairs = []
- unique_links = []
-
- soft_failure = False
-
- count = 1
- html_files = get_all_html_files(path)
- total = len(html_files)
- if args.file is not None:
- total = len(args.file)
- for hf in html_files:
- if args.file is None or hf in args.file:
- print("(%s/%s) Checking '%s'" % (count, total, hf))
- count += 1
- results = check_file(hf, skip_list)
- for r in results:
- if r not in failed_links:
- failed_links.append(r)
- if len(unique_links) == 0:
- print("No web links to check.")
- else:
- loop = asyncio.new_event_loop()
- asyncio.set_event_loop(loop)
- cul_result = loop.run_until_complete(check_unique_links())
- loop.close()
- # If we are NOT reporting broken external links as an error,
- # report them as warnings if there are any.
- if args.no_external_errors:
- if cul_result != []:
- print("\n\nWARNING! %s failed external links have been "
- "found:\n" % len(cul_result))
- report_failed_links(cul_result)
- soft_failure = True
+ total = len(html_files)
+ # Loop over all HTML files found
+ count = 1
+ for html_file in html_files:
+ # Check to see if a file check list is set
+ # and if the file is in the check list.
+ if self.args.file is None or html_file in self.args.file:
+ print("(%s/%s) Checking '%s'" % (count, total, html_file))
+ count += 1
+ # Check the file for broken links
+ results = self.check_file(html_file, skip_list)
+ for broken_link in results:
+ if broken_link not in self.failed_links:
+ self.failed_links.append(broken_link)
+
+ if len(self.unique_links) == 0:
+ print("No web links to check.")
else:
- # Can do a simple append here because these are all web failures
- # and so don't need to check if the failure already exists in the
- # list.
- failed_links += cul_result
- if failed_links != []:
- if output_file is not None:
- save_out = sys.stdout
- fsock = open(output_file, 'w')
- sys.stdout = fsock
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+ cul_result = loop.run_until_complete(self.check_unique_links())
+ loop.close()
+ # If we are NOT reporting broken external links as an error,
+ # report them as warnings if there are any.
+ if self.args.no_external_errors:
+ if cul_result != []:
+ print("\n\nWARNING! %s failed external links have been "
+ "found:\n" % len(cul_result))
+ self.report_failed_links(cul_result)
+ soft_failure = True
+ else:
+ # Can do a simple append here because these are all web failures
+ # and so don't need to check if the failure already exists in the
+ # list.
+ self.failed_links += cul_result
+ if self.failed_links != []:
+ if self.output_file is not None:
+ save_out = sys.stdout
+ fsock = open(self.output_file, 'w')
+ sys.stdout = fsock
+ else:
+ print("")
+ print("%s failed links have been found:\n" %
+ len(self.failed_links))
+ self.report_failed_links(self.failed_links)
+ if self.output_file is not None:
+ sys.stdout = save_out
+ fsock.close()
+ sys.exit(1)
+ if soft_failure:
+ print("\nLinks have been checked; warnings reported.")
else:
- print("")
- print("%s failed links have been found:\n" % len(failed_links))
- report_failed_links(failed_links)
- if output_file is not None:
- sys.stdout = save_out
- fsock.close()
- sys.exit(1)
- if soft_failure:
- print("\nLinks have been checked; warnings reported.")
- else:
- print("\nLinks have been successfully checked.")
-
+ print("\nLinks have been successfully checked.")
-def report_failed_links(failed_links):
- failure_dict = failures_to_dict(failed_links)
- for file in sorted(failure_dict):
- print("%s:" % file)
- for ref in failure_dict[file]:
- print(" %s" % ref)
+ def report_failed_links(self, failed_links):
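+        """
+        Print each affected file followed by its broken links.
+        """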
+ failure_dict = self.failures_to_dict(failed_links)
+ for file in sorted(failure_dict):
+ print("%s:" % file)
+ for ref in failure_dict[file]:
+ print(" %s" % ref)
if __name__ == '__main__':
- parser = argparse.ArgumentParser(description="Scan for broken links")
- parser.add_argument('-d', '--directory', nargs='?', default=None,
- help='specifies the directory to scan')
- parser.add_argument('--skip-dns-check', nargs='?', default=None,
- help='specifies text file of FQDNs to skip the DNS '
- 'check on')
- parser.add_argument('-s', '--skip-path', action='append',
- help='specifies a path to skip when checking URLs')
- parser.add_argument('-v', '--verbose', action='count')
- parser.add_argument('-f', '--file', action='append',
- help=('specifies a file to check;'
- ' all non-specified files are ignored'))
- parser.add_argument('--nointernal', action='store_true',
- help='skips checking of internal references')
- parser.add_argument('--noexternal', action='store_true',
- help='skips checking of external references')
- parser.add_argument('-o', '--output', nargs='?', default=None,
- help='specifies output file for error results')
- parser.add_argument('--no-external-errors', action='store_true',
- help='ignores errors caused by external broken links')
- args = parser.parse_args()
- html_cache_results = {}
- dns_skip = []
- verbose = 0
- output_file = None
- print("Linaro Link Checker")
-
- if args.verbose is not None:
- verbose = args.verbose
- print("Verbosity is at level %s" % verbose)
- if args.skip_dns_check is not None:
- print("Loading FQDN skip list from %s" % args.skip_dns_check)
- try:
- dns_skip = list(open(args.skip_dns_check))
- except Exception as exception:
- print("Couldn't load FQDN skip list")
- if args.output is not None:
- output_file = args.output
- if args.directory is not None:
- print("Scanning '%s'" % args.directory)
- os.chdir(args.directory)
- if args.nointernal:
- print("Skipping internal link checking")
- if args.noexternal:
- print("Skipping external link checking")
- # For now, assume that we're just scanning the current directory. Add code
- # for file paths and possibly URLs at a future date ...
- scan_directory("./", args.skip_path)
+ link_checker = JekyllLinkChecker()