From ce0e2715d79e1d12c80935276f7b39b81ee2a00d Mon Sep 17 00:00:00 2001 From: Bruce Bolt Date: Wed, 17 Jul 2024 11:51:51 +0100 Subject: [PATCH] Specifically report where Cloudflare block our requests Some linked websites use [Cloudflare Browser Integrity Check](https://developers.cloudflare.com/waf/tools/browser-integrity-check/) to stop automated robots from accessing their site. This works by responding with a 403 status code and a payload that contains some JavaScript. The JavaScript carries out some checks before reloading the page with a 200 status code. Our crawler is unable to interact with the JavaScript, so Cloudflare block our access to the site. In order to stop users thinking these links are actually broken, this change reports a different error when we are blocked by Cloudflare. --- app/lib/link_checker/uri_checker/http_checker.rb | 12 +++++++++++- app/lib/link_checker/uri_checker/problem.rb | 1 + config/locales/en.yml | 5 +++++ spec/lib/link_checker_spec.rb | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/app/lib/link_checker/uri_checker/http_checker.rb b/app/lib/link_checker/uri_checker/http_checker.rb index c4558320..50b52052 100644 --- a/app/lib/link_checker/uri_checker/http_checker.rb +++ b/app/lib/link_checker/uri_checker/http_checker.rb @@ -53,6 +53,12 @@ def initialize(options = {}) end end + class PageBlocksBots < Error + def initialize(options = {}) + super(summary: :page_blocks_bots, message: :page_blocked_bots, **options) + end + end + class PageRequiresLogin < Error def initialize(options = {}) super(summary: :page_requires_login, message: :login_required_to_view, **options) @@ -172,7 +178,11 @@ def check_request if response.status == 404 || response.status == 410 add_problem(PageNotFound.new(from_redirect: from_redirect?)) elsif response.status == 401 || response.status == 403 - add_problem(PageRequiresLogin.new(from_redirect: from_redirect?)) + if response.headers["cf-mitigated"] == "challenge" + add_problem(PageBlocksBots.new(from_redirect: from_redirect?)) + else + add_problem(PageRequiresLogin.new(from_redirect: from_redirect?)) + end elsif response.status >= 400 && response.status < 500 add_problem(PageIsUnavailable.new(from_redirect: from_redirect?, status: response.status)) elsif response.status >= 500 && response.status < 600 diff --git a/app/lib/link_checker/uri_checker/problem.rb b/app/lib/link_checker/uri_checker/problem.rb index c32072c6..29bf4158 100644 --- a/app/lib/link_checker/uri_checker/problem.rb +++ b/app/lib/link_checker/uri_checker/problem.rb @@ -47,6 +47,7 @@ def get_string(symbol) NoHost HttpCommunicationError PageNotFound + PageBlocksBots PageRequiresLogin PageIsUnavailable PageRespondsWithError diff --git a/config/locales/en.yml b/config/locales/en.yml index a408432e..bab6c1e8 100644 --- a/config/locales/en.yml +++ b/config/locales/en.yml @@ -64,6 +64,11 @@ en: redirect: This redirects to a page not found (404). find_content_now: Find where the content is now hosted and link to that instead. + page_blocks_bots: Page blocks robots + page_blocked_bots: + singular: Our link checker was blocked from accessing the website. + redirect: This redirects to a page that blocked our link checker from accessing the website. + page_requires_login: Page requires login login_required_to_view: singular: A login is required to view this page. diff --git a/spec/lib/link_checker_spec.rb b/spec/lib/link_checker_spec.rb index 6cfee183..2235294d 100644 --- a/spec/lib/link_checker_spec.rb +++ b/spec/lib/link_checker_spec.rb @@ -207,6 +207,20 @@ include_examples "has no warnings" end + context "403 status code with Cloudflare challenge header" do + let(:uri) { "http://www.not-gov.uk/403" } + before do + stub_request(:get, uri).to_return( + status: 403, + headers: { + "cf-mitigated" => "challenge", + }, + ) + end + include_examples "has errors", "Our link checker was blocked from accessing the website." + include_examples "has no warnings" + end + context "403 status code without Cloudflare challenge header" do let(:uri) { "http://www.not-gov.uk/403" } before { stub_request(:get, uri).to_return(status: 403) }