From e067a77370695c7a85b51c40fd966ac396cb51cb Mon Sep 17 00:00:00 2001
From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com>
Date: Wed, 7 Aug 2024 11:38:14 +0200
Subject: [PATCH] Light clean up

---
 lib/crawler/api/crawl.rb   |  5 +++++
 lib/crawler/coordinator.rb | 26 ++++++++++++++++++++++----
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/lib/crawler/api/crawl.rb b/lib/crawler/api/crawl.rb
index 7b890a9..bca489c 100644
--- a/lib/crawler/api/crawl.rb
+++ b/lib/crawler/api/crawl.rb
@@ -47,6 +47,7 @@ def initialize(config)
       delegate :system_logger, :events, :stats, to: :config
       delegate :rule_engine, to: :sink
 
+      #---------------------------------------------------------------------------------------------
       def shutdown_started?
         @shutdown_started.true?
       end
@@ -64,6 +65,7 @@ def start_shutdown!(reason:, allow_resume: false)
         @shutdown_started.make_true
       end
 
+      #---------------------------------------------------------------------------------------------
       # Waits for a specified number of seconds, stopping earlier if we are in a shutdown mode
       def interruptible_sleep(period)
         start_time = Time.now
@@ -75,10 +77,12 @@ def interruptible_sleep(period)
         end
       end
 
+      #---------------------------------------------------------------------------------------------
       def coordinator
         @coordinator ||= Crawler::Coordinator.new(self)
       end
 
+      #-----------------------------------------------------------------------------------------------
       # Starts a new crawl described by the given config. The job is started immediately.
       def start! # rubocop:disable Metrics/AbcSize
         events.crawl_start(
@@ -106,6 +110,7 @@ def start! # rubocop:disable Metrics/AbcSize
         end
       end
 
+      #---------------------------------------------------------------------------------------------
       # Returns a hash with crawl-specific status information
       # Note: This is used by the `EventGenerator` class for crawl-status events and by the Crawler Status API.
       # Please update OpenAPI specs if you add any new fields here.
diff --git a/lib/crawler/coordinator.rb b/lib/crawler/coordinator.rb
index 19c76ac..aa83078 100644
--- a/lib/crawler/coordinator.rb
+++ b/lib/crawler/coordinator.rb
@@ -54,6 +54,7 @@ def initialize(crawl)
       @started_at = Time.now
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Returns crawl duration in seconds or +nil+ if crawl has not been started yet
     def crawl_duration
       started_at ? Time.now - started_at : nil
@@ -64,6 +65,7 @@ def active_threads
       task_executors.length
     end
 
+    #-----------------------------------------------------------------------------------------------
     def run_crawl!
       run_primary_crawl!
       run_purge_crawl! if purge_crawls_allowed?
@@ -96,10 +98,6 @@ def run_purge_crawl!
       # Fetch URLs from docs for pages that were previously indexed but not seen this crawl
       @purge_backlog = sink.fetch_missing_docs(started_at)
 
-      system_logger.info('******')
-      system_logger.info(@purge_backlog)
-      system_logger.info('******')
-
       if @purge_backlog.empty?
         system_logger.info('No documents were found for the purge crawl. Skipping purge crawl.')
         return
@@ -131,6 +129,7 @@ def purge_crawls_allowed?
       true
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Communicates the progress on a given crawl task via the system log and Java thread names
     def crawl_task_progress(crawl_task, message)
       progress_message = "#{crawl_task.inspect}: #{message}"
@@ -138,6 +137,7 @@ def crawl_task_progress(crawl_task, message)
       system_logger.debug("Crawl task progress: #{progress_message}")
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Loads robots.txt for each configured domain and registers it
     def load_robots_txts
       config.domain_allowlist.each do |domain|
@@ -149,6 +149,7 @@ def load_robots_txts
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Fetches robots.txt for a given domain and returns it as a crawl result
     def load_robots_txt(domain)
       crawl_task = Crawler::Data::CrawlTask.new(
@@ -178,6 +179,7 @@ def load_robots_txt(domain)
       crawl_result
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Seed the crawler with configured URLs
     def enqueue_seed_urls
       system_logger.debug("Seeding the crawl with #{config.seed_urls.size} URLs...")
@@ -189,6 +191,7 @@ def enqueue_seed_urls
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Seed the crawler with pre-configured sitemaps
     def enqueue_sitemaps
       if config.sitemap_urls.any?
@@ -236,16 +239,19 @@ def fetch_valid_auto_discovered_sitemap_urls!
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     def set_outcome(outcome, message)
       @crawl_results[@crawl_stage][:outcome] = outcome
       @crawl_results[@crawl_stage][:outcome] = message
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Returns +true+ if there are any free executors available to run crawl tasks
     def executors_available?
       task_executors.length < task_executors.max_length
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Checks if we should terminate the crawl loop and sets the outcome value accordingly
     def crawl_finished?
       return true if @crawl_results[@crawl_stage][:outcome]
@@ -301,6 +307,7 @@ def run_crawl_loop
       log_crawl_end_event
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Performs a single iteration of the crawl loop
     def prepare_crawl_task
       return if shutdown_started?
@@ -323,6 +330,7 @@ def prepare_crawl_task
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     def execute_crawl_task(crawl_task)
       # Fetch the page.
       crawl_result = execute_task(crawl_task)
@@ -335,6 +343,7 @@ def execute_crawl_task(crawl_task)
       raise
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Fetches a URL and logs info about the HTTP request/response.
     def execute_task(crawl_task, follow_redirects: false)
       crawl_task_progress(crawl_task, 'HTTP execution')
@@ -387,6 +396,7 @@ def process_crawl_result(crawl_task, crawl_result)
       events.url_extracted(**extracted_event)
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Extracts links from a given crawl result and pushes them into the crawl queue for processing
     def extract_and_enqueue_links(crawl_task, crawl_result)
       return if crawl_result.error? || @crawl_stage == CRAWL_STAGE_PURGE
@@ -398,6 +408,7 @@ def extract_and_enqueue_links(crawl_task, crawl_result)
       extract_and_enqueue_sitemap_links(crawl_task, crawl_result) if crawl_result.sitemap?
     end
 
+    #-----------------------------------------------------------------------------------------------
     def enqueue_redirect_link(crawl_task, crawl_result)
       add_urls_to_backlog(
         urls: [crawl_result.location],
@@ -409,6 +420,7 @@ def enqueue_redirect_link(crawl_task, crawl_result)
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     def extract_and_enqueue_html_links(crawl_task, crawl_result)
       canonical_link = crawl_result.canonical_link
       if canonical_link
@@ -444,6 +456,7 @@ def extract_and_enqueue_html_links(crawl_task, crawl_result)
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
       result = crawl_result.extract_links
       limit_reached, error = result.values_at(:limit_reached, :error)
@@ -472,6 +485,7 @@ def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     def extract_links(crawl_result, crawl_depth:)
       extracted_links = crawl_result.extract_links(limit: config.max_extracted_links_count)
       links, limit_reached = extracted_links.values_at(:links, :limit_reached)
@@ -499,6 +513,7 @@ def extract_links(crawl_result, crawl_depth:)
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Outputs the results of a single URL processing to an output module configured for the crawl
     def output_crawl_result(crawl_result)
       retries = 0
@@ -534,6 +549,7 @@ def output_crawl_result(crawl_result)
       end
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Adds a set of URLs to the backlog for processing (if they are OK to follow)
     def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
       return unless urls.any?
@@ -584,6 +600,7 @@ def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: ni
       events.crawl_seed(added_urls_count, type: :content) if source_type == SEED_LIST
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Adds a single url to the backlog for processing and logs an event associated with it
     # If the queue is full, drops the item on the floor and logs about it.
     def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
@@ -618,6 +635,7 @@ def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, red
       )
     end
 
+    #-----------------------------------------------------------------------------------------------
     # Receives a newly-discovered url, makes a decision on what to do with it and records it in the log
     # FIXME: Feels like we need a generic way of encoding URL decisions, probably in the rules engine
     def check_discovered_url(url:, type:, source_url:, crawl_depth:) # rubocop:disable Metrics/PerceivedComplexity