-
Notifications
You must be signed in to change notification settings - Fork 525
/
Copy pathlink-checker.rb
300 lines (236 loc) · 8.74 KB
/
link-checker.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# frozen_string_literal: true
# Copyright OpenSearch Contributors
# SPDX-License-Identifier: BSD-3-Clause
require 'net/http'
require 'jekyll/hooks'
require 'jekyll/document'
require 'json'
require 'set'
require 'uri'
require 'pathname'
require 'typhoeus'
require 'ruby-link-checker'
require 'ruby-enum'
##
# This singleton checks links during build to warn or fail upon finding dead links.
#
# `JEKYLL_LINK_CHECKER`, set on the environment, enables link verification.
# Valid values: internal, external, all.
# Usage: `JEKYLL_LINK_CHECKER=internal bundle exec jekyll build --trace`
#
# `JEKYLL_FATAL_LINK_CHECKER`, set on the environment, is the same as `JEKYLL_LINK_CHECKER`
# except that it fails the build if there are broken links. It takes the same valid values.
# Usage: `JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --trace`
module Jekyll::LinkChecker
# Enumerates the values accepted by the JEKYLL_LINK_CHECKER and
# JEKYLL_FATAL_LINK_CHECKER environment variables; `init` rejects any other value.
class CheckTypes
include Ruby::Enum
# Enables the internal link checks only
define :INTERNAL, 'internal'
# Enables the external link checks only
define :EXTERNAL, 'external'
# Enables both the internal and the external link checks
define :ALL, 'all'
end
##
# Maps every collected href to the Set of page paths that link to it.
# Filled by `process`, consumed by `verify` and reset by `init`.
@urls = nil

##
# Pattern to identify documents that should be excluded based on their path
@excluded_paths = %r{(\.(css|js|json|map|xml|txt|yml)$|/version-selector\.tpl$)}i.freeze

##
# Pattern to extract the href of anchor tags: capture 1 is the quote character, capture 2 the href value
@href_matcher = /<a[^>]+href=(['"])(.+?)\1/im.freeze

##
# Pattern to check for external URLs
@external_matcher = %r{^https?://}.freeze

##
# List of domains to ignore; hosts are compared for exact equality and the list is frozen
# because it is shared, read-only configuration.
# playground.opensearch.org is causing an infinite redirect
# LinkedIn mostly fails with 999 status codes
# NOTE(review): no LinkedIn host is listed below - confirm whether one should be added
@ignored_domains = [
  'localhost',
  'playground.opensearch.org', # infinite redirect, https://github.com/opensearch-project/dashboards-anywhere/issues/172
  'crates.io', # 404s on bots
  'www.cloudflare.com', # 403s on bots
  'platform.openai.com', # 403s on bots
  'openai.com', # 403s on bots
  'mvnrepository.com', # 403s on bots
  'www.intel.com', # 403s on bots
  'example.issue.link' # a fake example link from the template
].freeze

##
# Pattern of local paths to ignore
@ignored_paths = %r{(^/javadocs|^mailto:)}.freeze

##
# Holds the list of failures (human-readable strings); reset by `init`
@failures = nil

##
# Build flags driven by environment variables; all of them stay unset until `init` runs
@check_internal_links = nil # Enables checking internal links
@check_external_links = nil # Enables checking external links
@fail_on_error = nil # Indicates the need to fail the build for dead links
##
# Defines the priority of the plugin
# The hooks are registered with a very low priority to make sure they run after any content-modifying hook
#
# Returns the Integer priority handed to every Jekyll::Hooks.register call made by this plugin.
def self.priority
10
end
# Tells whether at least one kind of link checking (external or internal) is enabled.
def self.check_links?
  if check_external_links?
    true
  else
    check_internal_links?
  end
end
# Tells whether external links are verified; always answers with a strict boolean.
def self.check_external_links?
  @check_external_links ? true : false
end
# Tells whether internal links are verified; always answers with a strict boolean.
def self.check_internal_links?
  @check_internal_links ? true : false
end
# Tells whether dead links must fail the build; always answers with a strict boolean.
def self.fail_on_error?
  @fail_on_error ? true : false
end
##
# Initializes the singleton for a build.
#
# Records the site, resets the collected links and failures, reads the environment flags and, when
# checking is enabled, creates the external link checker and registers the hooks that collect and
# verify links. `JEKYLL_FATAL_LINK_CHECKER` takes precedence over `JEKYLL_LINK_CHECKER` when present.
#
# This method is called from a `:site, :pre_render` hook and can therefore run more than once in the
# same process (for example on each regeneration while serving); the collecting and verifying hooks
# are registered only the first time so that pages are not processed and reported repeatedly.
#
# site - the Jekyll site being built
#
# Logs and re-raises any StandardError raised while setting up.
def self.init(site)
  @site = site
  @urls = {}
  @failures = []

  @fail_on_error = true if ENV.key?('JEKYLL_FATAL_LINK_CHECKER')
  flag_name = fail_on_error? ? 'JEKYLL_FATAL_LINK_CHECKER' : 'JEKYLL_LINK_CHECKER'
  check_flag = ENV[flag_name]

  unless check_flag
    return Jekyll.logger.info 'LinkChecker:', 'disabled. Enable with JEKYLL_LINK_CHECKER on the environment'
  end

  unless CheckTypes.values.include?(check_flag)
    Jekyll.logger.info "LinkChecker: [Notice] Could not initialize, Valid values for #{flag_name} are #{CheckTypes.values}"
    return
  end

  @external_link_checker = LinkChecker::Typhoeus::Hydra::Checker.new(
    logger: Jekyll.logger,
    hydra: { max_concurrency: 2 },
    retries: 3,
    user_agent: 'OpenSearch Documentation Website Link Checker/1.0'
  )

  # External results arrive through this callback, so their failures are recorded here
  @external_link_checker.on :failure, :error do |result|
    @failures << "#{result}, linked to in #{result.options[:location]}"
  end

  @check_external_links = [CheckTypes::EXTERNAL, CheckTypes::ALL].include?(check_flag)
  @check_internal_links = [CheckTypes::INTERNAL, CheckTypes::ALL].include?(check_flag)

  unless @hooks_registered
    # Process a Page as soon as its content is ready
    Jekyll::Hooks.register :pages, :post_convert, priority: priority do |page|
      process(page)
    end

    # Process a Document as soon as its content is ready
    Jekyll::Hooks.register :documents, :post_convert, priority: priority do |document|
      process(document)
    end

    # Verify gathered links after Jekyll is done writing all its stuff
    Jekyll::Hooks.register :site, :post_write, priority: priority do |written_site|
      verify(written_site)
    end

    @hooks_registered = true
  end

  if check_links?
    Jekyll.logger.info "LinkChecker: [Notice] Initialized successfully and will check #{check_flag} links"
  end
  Jekyll.logger.info 'LinkChecker: [Notice] The build will fail if a dead link is found' if fail_on_error?
rescue StandardError => e
  Jekyll.logger.error "LinkChecker: [Error] Failed to initialize Link Checker: #{e.message}"
  raise
end
##
# Collects the links of a converted Document or Page.
#
# Links to a fragment of the same page (`#fragment`) are verified right away against the page's own
# content: a failure is recorded when no element carries that fragment as its `id` or `name`.
# A bare `#` is skipped. Every other href is stored in @urls together with the path of the page that
# links to it, to be verified by `verify`.
#
# page - an object responding to `path` and `content`
def self.process(page)
  return unless check_links?
  return if @excluded_paths.match?(page.path)

  hrefs = page.content.scan(@href_matcher)
  return if hrefs.empty?

  # The location is the same for every link of the page, so it is resolved only once
  relative_path = page.path[0] == '/' ? Pathname.new(page.path).relative_path_from(Dir.getwd) : page.path

  hrefs.each do |(_quote, href)|
    next if href.eql?('#')

    if href.start_with?('#')
      anchor = href[1..]
      # The fragment is literal text: escape it so that characters such as `.` or `(` neither
      # match unrelated ids nor make the pattern invalid
      next if page.content.match?(/<[a-z0-9-]+[^>]+(?:id|name)="#{Regexp.escape(anchor)}"/i)

      Jekyll.logger.info relative_path
      @failures << "##{anchor}, linked in ./#{relative_path}"
    else
      (@urls[href] ||= Set[]) << relative_path
    end
  end
end
##
# Verifies every link gathered by `process` and reports the outcome.
#
# Each collected URL is handed to `check` together with the list of pages linking to it; a falsy
# answer is recorded as a failure. The external link checker is then run, which may record further
# failures through the callback installed by `init`. Finally the result is logged: success when no
# failure was recorded, otherwise a warning, or an error plus an exception when `fail_on_error?`.
#
# _site - unused; the site recorded by `init` is used instead
#
# Raises RuntimeError listing the dead links when there are failures and `fail_on_error?` is true.
def self.verify(_site)
  return unless check_links?

  # Matches absolute URLs pointing at this very site and captures their site-relative path.
  # The prefix is escaped because it is literal text (an unescaped `.` would match any character).
  site_root = Regexp.escape("#{@site.config['url']}#{@site.baseurl}")
  @base_url_matcher = %r{^#{site_root}(/.*)$}.freeze

  # Links are visited in random order
  # NOTE(review): presumably to avoid sending consecutive requests to the same host - confirm
  @urls.sort_by { |_url, _pages| rand }.each do |url, pages|
    location = "./#{pages.to_a.join(', ./')}"
    @failures << "#{url}, linked to in #{location}" unless check(url, location)
  end

  @external_link_checker.run

  if @failures.empty?
    Jekyll.logger.info "\nLinkChecker: [Success] No broken links!\n".green
    return
  end

  msg = "Found #{@failures.size} dead link#{@failures.size > 1 ? 's' : ''}:\n#{@failures.join("\n")}"
  if fail_on_error?
    Jekyll.logger.error "\nLinkChecker: [Error] #{msg}\n".red
    raise msg
  end

  Jekyll.logger.warn "\nLinkChecker: [Warning] #{msg}\n".red
end
##
# Checks a single URL, dispatching to the external or the internal check.
#
# An absolute URL pointing at this site is first reduced to its site-relative path, and a path
# starting with `/docs/` is prefixed with the configured site URL, which makes it an external URL.
# A kind of link whose checking is disabled is reported as accessible.
#
# url      - the href to check
# location - the pages linking to it, used for reporting
#
# Returns true when the check is skipped, otherwise whatever the delegated check returns.
def self.check(url, location)
  own_site = @base_url_matcher.match(url)
  url = own_site[1] if own_site
  url = @site.config['url'] + url if url.start_with?('/docs/')

  if @external_matcher.match?(url)
    check_external_links? ? check_external(url, location) : true
  else
    check_internal_links? ? check_internal(url, location) : true
  end
end
##
# Hands an external URL over to the external link checker.
#
# The URL is parsed when possible; a URL that cannot be parsed is passed on as the original String.
# URLs whose host is one of @ignored_domains are reported as accessible without being checked.
#
# url      - the external URL
# location - the pages linking to it, forwarded to the checker as the `:location` option
#
# Returns true for ignored hosts, otherwise the return value of the checker's `check`.
def self.check_external(url, location)
  target =
    begin
      URI(url)
    rescue StandardError
      url
    end

  ignored = target.is_a?(URI) && @ignored_domains.include?(target.host)
  return true if ignored

  @external_link_checker.check(target, { location: location })
end
##
# Checks whether an internal link resolves to a file of the generated site.
#
# The link is looked up under the site's `destination`; a path without a file extension is treated
# as a directory and resolved to its `index.html`. When the link carries a fragment, the target file
# must contain an element whose `id` or `name` equals that fragment (the same rule `process` applies
# to same-page anchors). A file containing `<title>Redirecting` is followed through its first anchor
# tag, keeping the fragment, and the target is checked with `check`.
#
# url      - the site-relative URL, optionally followed by `#fragment`
# location - the pages linking to it, used for logging and forwarded when following a redirect
#
# Returns true when the link is ignored, found, or is a redirect that cannot be followed;
# false when the file or the fragment is missing; the result of `check` for followed redirects.
def self.check_internal(url, location)
  Jekyll.logger.info "LinkChecker: [Info] Checking #{url} (#{location})".cyan
  return true if @ignored_paths =~ url

  path, hash = url.split('#')
  has_fragment = !(hash.nil? || hash.empty?)

  # Without a file extension the path designates a directory, served through its index.html
  unless path =~ %r{\.[^/]{2,}$}
    path = "#{path}/" unless path.end_with?('/')
    path = "#{path}index.html" unless path.end_with?('index.html')
  end

  filename = File.join(@site.config['destination'], path)
  return false unless File.file?(filename)

  content = File.read(filename)

  unless content.include?('<title>Redirecting')
    return true unless has_fragment

    # The fragment is literal text: escape it so that characters such as `.` or `(` neither
    # match unrelated ids nor make the pattern invalid
    return content.match?(/<[a-z0-9-]+[^>]+(?:id|name)="#{Regexp.escape(hash)}"/i)
  end

  # Redirect page: follow the first link it contains
  match = content.match(@href_matcher)
  if match.nil?
    Jekyll.logger.warn "LinkChecker: [Warning] Cannot check #{url} due to an unfollowable redirect"
    return true
  end

  redirect = has_fragment ? "#{match[2]}##{hash}" : match[2]
  check(redirect, location)
end
end
# (Re)initialize the LinkChecker right before the site is rendered, i.e. before any Document or Page
# is processed. The handler is a block on purpose: it takes only the arguments it declares.
Jekyll::Hooks.register(:site, :pre_render, priority: Jekyll::LinkChecker.priority) { |site| Jekyll::LinkChecker.init(site) }