
Commit

Slight clean up
navarone-feekery committed Aug 7, 2024
1 parent b66a7a2 commit f0f4b44
Showing 2 changed files with 24 additions and 4 deletions.
lib/crawler/api/crawl.rb: 4 changes, 4 additions & 0 deletions
@@ -47,6 +47,7 @@ def initialize(config)
delegate :system_logger, :events, :stats, to: :config
delegate :rule_engine, to: :sink

#---------------------------------------------------------------------------------------------
def shutdown_started?
@shutdown_started.true?
end
@@ -64,6 +65,7 @@ def start_shutdown!(reason:, allow_resume: false)
@shutdown_started.make_true
end

#---------------------------------------------------------------------------------------------
# Waits for a specified number of seconds, stopping earlier if we are in a shutdown mode
def interruptible_sleep(period)
start_time = Time.now
@@ -75,6 +77,7 @@ def interruptible_sleep(period)
end
end

#---------------------------------------------------------------------------------------------
def coordinator
@coordinator ||= Crawler::Coordinator.new(self)
end
@@ -106,6 +109,7 @@ def start! # rubocop:disable Metrics/AbcSize
end
end

#---------------------------------------------------------------------------------------------
# Returns a hash with crawl-specific status information
# Note: This is used by the `EventGenerator` class for crawl-status events and by the Crawler Status API.
# Please update OpenAPI specs if you add any new fields here.
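For orientation: the body of interruptible_sleep is collapsed in the crawl.rb hunks above. Below is a minimal, runnable sketch of the pattern its comment describes: sleep in short increments and return early once shutdown has started. The SleepSketch stub class, the use of Concurrent::AtomicBoolean (inferred from the true?/make_true calls above), the 0.5-second polling tick, and the usage lines are illustrative assumptions, not code from the repository.

require 'concurrent'

class SleepSketch
  def initialize
    # AtomicBoolean is an assumption, inferred from the true?/make_true calls above
    @shutdown_started = Concurrent::AtomicBoolean.new(false)
  end

  def shutdown_started?
    @shutdown_started.true?
  end

  def start_shutdown!
    @shutdown_started.make_true
  end

  # Waits up to +period+ seconds, checking for shutdown on an assumed 0.5s tick
  def interruptible_sleep(period)
    start_time = Time.now
    until shutdown_started?
      remaining = period - (Time.now - start_time)
      break if remaining <= 0

      sleep([remaining, 0.5].min)
    end
  end
end

# Usage: the sleep ends after roughly one second instead of ten
sketch = SleepSketch.new
Thread.new { sleep(1); sketch.start_shutdown! }
sketch.interruptible_sleep(10)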
lib/crawler/coordinator.rb: 24 changes, 20 additions & 4 deletions
@@ -54,6 +54,7 @@ def initialize(crawl)
@started_at = Time.now
end

#-----------------------------------------------------------------------------------------------
# Returns crawl duration in seconds or +nil+ if crawl has not been started yet
def crawl_duration
started_at ? Time.now - started_at : nil
@@ -64,6 +65,7 @@ def active_threads
task_executors.length
end

#-----------------------------------------------------------------------------------------------
def run_crawl!
run_primary_crawl!
run_purge_crawl! if purge_crawls_allowed?
@@ -96,10 +98,6 @@ def run_purge_crawl!
# Fetch URLs from docs for pages that were previously indexed but not seen this crawl
@purge_backlog = sink.fetch_missing_docs(started_at)

system_logger.info('******')
system_logger.info(@purge_backlog)
system_logger.info('******')

if @purge_backlog.empty?
system_logger.info('No documents were found for the purge crawl. Skipping purge crawl.')
return
@@ -131,13 +129,15 @@ def purge_crawls_allowed?
true
end

#-----------------------------------------------------------------------------------------------
# Communicates the progress on a given crawl task via the system log and Java thread names
def crawl_task_progress(crawl_task, message)
progress_message = "#{crawl_task.inspect}: #{message}"
java.lang.Thread.currentThread.name = progress_message
system_logger.debug("Crawl task progress: #{progress_message}")
end

#-----------------------------------------------------------------------------------------------
# Loads robots.txt for each configured domain and registers it
def load_robots_txts
config.domain_allowlist.each do |domain|
@@ -178,6 +178,7 @@ def load_robots_txt(domain)
crawl_result
end

#-----------------------------------------------------------------------------------------------
# Seed the crawler with configured URLs
def enqueue_seed_urls
system_logger.debug("Seeding the crawl with #{config.seed_urls.size} URLs...")
@@ -189,6 +190,7 @@ def enqueue_seed_urls
)
end

#-----------------------------------------------------------------------------------------------
# Seed the crawler with pre-configured sitemaps
def enqueue_sitemaps
if config.sitemap_urls.any?
@@ -236,6 +238,7 @@ def fetch_valid_auto_discovered_sitemap_urls!
end
end

#-----------------------------------------------------------------------------------------------
def set_outcome(outcome, message)
@crawl_results[@crawl_stage][:outcome] = outcome
@crawl_results[@crawl_stage][:message] = message
@@ -246,6 +249,7 @@ def executors_available?
task_executors.length < task_executors.max_length
end

#-----------------------------------------------------------------------------------------------
# Checks if we should terminate the crawl loop and sets the outcome value accordingly
def crawl_finished?
return true if @crawl_results[@crawl_stage][:outcome]
@@ -301,6 +305,7 @@ def run_crawl_loop
log_crawl_end_event
end

#-----------------------------------------------------------------------------------------------
# Performs a single iteration of the crawl loop
def prepare_crawl_task
return if shutdown_started?
@@ -323,6 +328,7 @@ def prepare_crawl_task
end
end

#-----------------------------------------------------------------------------------------------
def execute_crawl_task(crawl_task)
# Fetch the page.
crawl_result = execute_task(crawl_task)
@@ -335,6 +341,7 @@ def execute_crawl_task(crawl_task)
raise
end

#-----------------------------------------------------------------------------------------------
# Fetches a URL and logs info about the HTTP request/response.
def execute_task(crawl_task, follow_redirects: false)
crawl_task_progress(crawl_task, 'HTTP execution')
@@ -387,6 +394,7 @@ def process_crawl_result(crawl_task, crawl_result)
events.url_extracted(**extracted_event)
end

#-----------------------------------------------------------------------------------------------
# Extracts links from a given crawl result and pushes them into the crawl queue for processing
def extract_and_enqueue_links(crawl_task, crawl_result)
return if crawl_result.error? || @crawl_stage == CRAWL_STAGE_PURGE
@@ -398,6 +406,7 @@ def extract_and_enqueue_links(crawl_task, crawl_result)
extract_and_enqueue_sitemap_links(crawl_task, crawl_result) if crawl_result.sitemap?
end

#-----------------------------------------------------------------------------------------------
def enqueue_redirect_link(crawl_task, crawl_result)
add_urls_to_backlog(
urls: [crawl_result.location],
@@ -409,6 +418,7 @@ def enqueue_redirect_link(crawl_task, crawl_result)
)
end

#-----------------------------------------------------------------------------------------------
def extract_and_enqueue_html_links(crawl_task, crawl_result)
canonical_link = crawl_result.canonical_link
if canonical_link
@@ -444,6 +454,7 @@ def extract_and_enqueue_html_links(crawl_task, crawl_result)
)
end

#-----------------------------------------------------------------------------------------------
def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
result = crawl_result.extract_links
limit_reached, error = result.values_at(:limit_reached, :error)
@@ -472,6 +483,7 @@ def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
end
end

#-----------------------------------------------------------------------------------------------
def extract_links(crawl_result, crawl_depth:)
extracted_links = crawl_result.extract_links(limit: config.max_extracted_links_count)
links, limit_reached = extracted_links.values_at(:links, :limit_reached)
@@ -499,6 +511,7 @@ def extract_links(crawl_result, crawl_depth:)
end
end

#-----------------------------------------------------------------------------------------------
# Outputs the results of a single URL processing to an output module configured for the crawl
def output_crawl_result(crawl_result)
retries = 0
@@ -534,6 +547,7 @@ def output_crawl_result(crawl_result)
end
end

#-----------------------------------------------------------------------------------------------
# Adds a set of URLs to the backlog for processing (if they are OK to follow)
def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
return unless urls.any?
@@ -584,6 +598,7 @@ def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: ni
events.crawl_seed(added_urls_count, type: :content) if source_type == SEED_LIST
end

#-----------------------------------------------------------------------------------------------
# Adds a single url to the backlog for processing and logs an event associated with it
# If the queue is full, drops the item on the floor and logs about it.
def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
@@ -618,6 +633,7 @@ def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, red
)
end

#-----------------------------------------------------------------------------------------------
# Receives a newly-discovered url, makes a decision on what to do with it and records it in the log
# FIXME: Feels like we need a generic way of encoding URL decisions, probably in the rules engine
def check_discovered_url(url:, type:, source_url:, crawl_depth:) # rubocop:disable Metrics/PerceivedComplexity
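The backlog helpers near the end of coordinator.rb take keyword arguments, as the signatures above show. The following is a hedged sketch of how the collapsed enqueue_seed_urls body might drive add_urls_to_backlog: config.seed_urls, the :content type, the SEED_LIST name, and the keyword signature are visible in the hunks above, while the SEED_LIST value, the stand-in logger and config, and the crawl depth of 1 are assumptions for illustration.

require 'logger'

# Stand-ins so the sketch runs on its own; in the repository these come from
# the Coordinator instance and its config.
SEED_LIST = :seed_list # assumed value; only the constant name appears above
SYSTEM_LOGGER = Logger.new($stdout)
CONFIG = Struct.new(:seed_urls).new(['https://example.com/'])

# Stand-in for the real add_urls_to_backlog shown above; it only reports its arguments
def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: [])
  SYSTEM_LOGGER.info("Would enqueue #{urls.size} #{type} URL(s) from #{source_type} at depth #{crawl_depth}")
end

# Hypothetical body for the collapsed enqueue_seed_urls method above
def enqueue_seed_urls
  SYSTEM_LOGGER.debug("Seeding the crawl with #{CONFIG.seed_urls.size} URLs...")
  add_urls_to_backlog(
    urls: CONFIG.seed_urls,
    type: :content,         # :content also appears in the crawl_seed event above
    source_type: SEED_LIST,
    crawl_depth: 1          # assumed starting depth
  )
end

enqueue_seed_urls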
