From e067a77370695c7a85b51c40fd966ac396cb51cb Mon Sep 17 00:00:00 2001
From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com>
Date: Wed, 7 Aug 2024 11:38:14 +0200
Subject: [PATCH] Light clean up

---
 lib/crawler/api/crawl.rb   |  5 +++++
 lib/crawler/coordinator.rb | 26 ++++++++++++++++++++++----
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/lib/crawler/api/crawl.rb b/lib/crawler/api/crawl.rb
index 7b890a9..bca489c 100644
--- a/lib/crawler/api/crawl.rb
+++ b/lib/crawler/api/crawl.rb
@@ -47,6 +47,7 @@ def initialize(config)
       delegate :system_logger, :events, :stats, to: :config
       delegate :rule_engine, to: :sink
 
+      #---------------------------------------------------------------------------------------------
       def shutdown_started?
         @shutdown_started.true?
       end
@@ -64,6 +65,7 @@ def start_shutdown!(reason:, allow_resume: false)
         @shutdown_started.make_true
       end
 
+      #---------------------------------------------------------------------------------------------
       # Waits for a specified number of seconds, stopping earlier if we are in a shutdown mode
       def interruptible_sleep(period)
         start_time = Time.now
@@ -75,10 +77,12 @@ def interruptible_sleep(period)
         end
       end
 
+      #---------------------------------------------------------------------------------------------
       def coordinator
         @coordinator ||= Crawler::Coordinator.new(self)
       end
 
+      #-----------------------------------------------------------------------------------------------
       # Starts a new crawl described by the given config. The job is started immediately.
       def start! # rubocop:disable Metrics/AbcSize
         events.crawl_start(
@@ -106,6 +110,7 @@ def start! # rubocop:disable Metrics/AbcSize
         end
       end
 
+      #---------------------------------------------------------------------------------------------
       # Returns a hash with crawl-specific status information
       # Note: This is used by the `EventGenerator` class for crawl-status events and by the Crawler Status API.
       # Please update OpenAPI specs if you add any new fields here.
diff --git a/lib/crawler/coordinator.rb b/lib/crawler/coordinator.rb
index 19c76ac..aa83078 100644
--- a/lib/crawler/coordinator.rb
+++ b/lib/crawler/coordinator.rb
@@ -54,6 +54,7 @@ def initialize(crawl)
       @started_at = Time.now
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Returns crawl duration in seconds or +nil+ if crawl has not been started yet
     def crawl_duration
       started_at ? Time.now - started_at : nil
@@ -64,6 +65,7 @@ def active_threads
       task_executors.length
     end
 
+    #-----------------------------------------------------------------------------------------------
     def run_crawl!
       run_primary_crawl!
       run_purge_crawl! if purge_crawls_allowed?
@@ -96,10 +98,6 @@ def run_purge_crawl!
       # Fetch URLs from docs for pages that were previously indexed but not seen this crawl
       @purge_backlog = sink.fetch_missing_docs(started_at)
 
-      system_logger.info('******')
-      system_logger.info(@purge_backlog)
-      system_logger.info('******')
-
       if @purge_backlog.empty?
         system_logger.info('No documents were found for the purge crawl. Skipping purge crawl.')
         return
@@ -131,6 +129,7 @@ def purge_crawls_allowed?
       true
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Communicates the progress on a given crawl task via the system log and Java thread names
     def crawl_task_progress(crawl_task, message)
       progress_message = "#{crawl_task.inspect}: #{message}"
@@ -138,6 +137,7 @@ def crawl_task_progress(crawl_task, message)
       system_logger.debug("Crawl task progress: #{progress_message}")
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Loads robots.txt for each configured domain and registers it
     def load_robots_txts
       config.domain_allowlist.each do |domain|
@@ -149,6 +149,7 @@ def load_robots_txts
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Fetches robots.txt for a given domain and returns it as a crawl result
     def load_robots_txt(domain)
       crawl_task = Crawler::Data::CrawlTask.new(
@@ -178,6 +179,7 @@ def load_robots_txt(domain)
       crawl_result
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Seed the crawler with configured URLs
     def enqueue_seed_urls
       system_logger.debug("Seeding the crawl with #{config.seed_urls.size} URLs...")
@@ -189,6 +191,7 @@ def enqueue_seed_urls
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Seed the crawler with pre-configured sitemaps
     def enqueue_sitemaps
       if config.sitemap_urls.any?
@@ -236,16 +239,19 @@ def fetch_valid_auto_discovered_sitemap_urls!
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     def set_outcome(outcome, message)
       @crawl_results[@crawl_stage][:outcome] = outcome
       @crawl_results[@crawl_stage][:outcome] = message
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Returns +true+ if there are any free executors available to run crawl tasks
     def executors_available?
       task_executors.length < task_executors.max_length
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Checks if we should terminate the crawl loop and sets the outcome value accordingly
     def crawl_finished?
       return true if @crawl_results[@crawl_stage][:outcome]
@@ -301,6 +307,7 @@ def run_crawl_loop
       log_crawl_end_event
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Performs a single iteration of the crawl loop
     def prepare_crawl_task
       return if shutdown_started?
@@ -323,6 +330,7 @@ def prepare_crawl_task
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     def execute_crawl_task(crawl_task)
       # Fetch the page.
       crawl_result = execute_task(crawl_task)
@@ -335,6 +343,7 @@ def execute_crawl_task(crawl_task)
       raise
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Fetches a URL and logs info about the HTTP request/response.
     def execute_task(crawl_task, follow_redirects: false)
       crawl_task_progress(crawl_task, 'HTTP execution')
@@ -387,6 +396,7 @@ def process_crawl_result(crawl_task, crawl_result)
       events.url_extracted(**extracted_event)
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Extracts links from a given crawl result and pushes them into the crawl queue for processing
     def extract_and_enqueue_links(crawl_task, crawl_result)
       return if crawl_result.error? || @crawl_stage == CRAWL_STAGE_PURGE
@@ -398,6 +408,7 @@ def extract_and_enqueue_links(crawl_task, crawl_result)
       extract_and_enqueue_sitemap_links(crawl_task, crawl_result) if crawl_result.sitemap?
     end
 
+    #-----------------------------------------------------------------------------------------------
     def enqueue_redirect_link(crawl_task, crawl_result)
       add_urls_to_backlog(
         urls: [crawl_result.location],
@@ -409,6 +420,7 @@ def enqueue_redirect_link(crawl_task, crawl_result)
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     def extract_and_enqueue_html_links(crawl_task, crawl_result)
       canonical_link = crawl_result.canonical_link
       if canonical_link
@@ -444,6 +456,7 @@ def extract_and_enqueue_html_links(crawl_task, crawl_result)
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
       result = crawl_result.extract_links
       limit_reached, error = result.values_at(:limit_reached, :error)
@@ -472,6 +485,7 @@ def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     def extract_links(crawl_result, crawl_depth:)
       extracted_links = crawl_result.extract_links(limit: config.max_extracted_links_count)
       links, limit_reached = extracted_links.values_at(:links, :limit_reached)
@@ -499,6 +513,7 @@ def extract_links(crawl_result, crawl_depth:)
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Outputs the results of a single URL processing to an output module configured for the crawl
     def output_crawl_result(crawl_result)
       retries = 0
@@ -534,6 +549,7 @@ def output_crawl_result(crawl_result)
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Adds a set of URLs to the backlog for processing (if they are OK to follow)
     def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
       return unless urls.any?
@@ -584,6 +600,7 @@ def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: ni
       events.crawl_seed(added_urls_count, type: :content) if source_type == SEED_LIST
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Adds a single url to the backlog for processing and logs an event associated with it
     # If the queue is full, drops the item on the floor and logs about it.
     def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
@@ -618,6 +635,7 @@ def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, red
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Receives a newly-discovered url, makes a decision on what to do with it and records it in the log
     # FIXME: Feels like we need a generic way of encoding URL decisions, probably in the rules engine
     def check_discovered_url(url:, type:, source_url:, crawl_depth:) # rubocop:disable Metrics/PerceivedComplexity