
Commit

Slight clean up
navarone-feekery committed Aug 7, 2024
1 parent b66a7a2 commit f0f4b44
Showing 2 changed files with 24 additions and 4 deletions.
lib/crawler/api/crawl.rb: 4 changes, 4 additions & 0 deletions
@@ -47,6 +47,7 @@ def initialize(config)
delegate :system_logger, :events, :stats, to: :config
delegate :rule_engine, to: :sink

#---------------------------------------------------------------------------------------------
def shutdown_started?
@shutdown_started.true?
end
@@ -64,6 +65,7 @@ def start_shutdown!(reason:, allow_resume: false)
@shutdown_started.make_true
end

#---------------------------------------------------------------------------------------------
# Waits for a specified number of seconds, stopping earlier if we are in a shutdown mode
def interruptible_sleep(period)
start_time = Time.now
@@ -75,6 +77,7 @@ def interruptible_sleep(period)
end
end

#---------------------------------------------------------------------------------------------
def coordinator
@coordinator ||= Crawler::Coordinator.new(self)
end
@@ -106,6 +109,7 @@ def start! # rubocop:disable Metrics/AbcSize
end
end

#---------------------------------------------------------------------------------------------
# Returns a hash with crawl-specific status information
# Note: This is used by the `EventGenerator` class for crawl-status events and by the Crawler Status API.
# Please update OpenAPI specs if you add any new fields here.
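For orientation: the body of interruptible_sleep is collapsed in the crawl.rb hunks above. Below is a minimal, runnable sketch of the pattern its comment describes: sleep in short increments and return early once shutdown has started. The SleepSketch stub class, the use of Concurrent::AtomicBoolean (inferred from the true?/make_true calls above), the 0.5-second polling tick, and the usage lines are illustrative assumptions, not code from the repository.

require 'concurrent'

class SleepSketch
  def initialize
    # AtomicBoolean is an assumption, inferred from the true?/make_true calls above
    @shutdown_started = Concurrent::AtomicBoolean.new(false)
  end

  def shutdown_started?
    @shutdown_started.true?
  end

  def start_shutdown!
    @shutdown_started.make_true
  end

  # Waits up to +period+ seconds, checking for shutdown on an assumed 0.5s tick
  def interruptible_sleep(period)
    start_time = Time.now
    until shutdown_started?
      remaining = period - (Time.now - start_time)
      break if remaining <= 0

      sleep([remaining, 0.5].min)
    end
  end
end

# Usage: the sleep ends after roughly one second instead of ten
sketch = SleepSketch.new
Thread.new { sleep(1); sketch.start_shutdown! }
sketch.interruptible_sleep(10)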
lib/crawler/coordinator.rb: 24 changes, 20 additions & 4 deletions
@@ -54,6 +54,7 @@ def initialize(crawl)
@started_at = Time.now
end

#-----------------------------------------------------------------------------------------------
# Returns crawl duration in seconds or +nil+ if crawl has not been started yet
def crawl_duration
started_at ? Time.now - started_at : nil
@@ -64,6 +65,7 @@ def active_threads
task_executors.length
end

#-----------------------------------------------------------------------------------------------
def run_crawl!
run_primary_crawl!
run_purge_crawl! if purge_crawls_allowed?
@@ -96,10 +98,6 @@ def run_purge_crawl!
# Fetch URLs from docs for pages that were previously indexed but not seen this crawl
@purge_backlog = sink.fetch_missing_docs(started_at)

system_logger.info('******')
system_logger.info(@purge_backlog)
system_logger.info('******')

if @purge_backlog.empty?
system_logger.info('No documents were found for the purge crawl. Skipping purge crawl.')
return
@@ -131,13 +129,15 @@ def purge_crawls_allowed?
true
end

#-----------------------------------------------------------------------------------------------
# Communicates the progress on a given crawl task via the system log and Java thread names
def crawl_task_progress(crawl_task, message)
progress_message = "#{crawl_task.inspect}: #{message}"
java.lang.Thread.currentThread.name = progress_message
system_logger.debug("Crawl task progress: #{progress_message}")
end

#-----------------------------------------------------------------------------------------------
# Loads robots.txt for each configured domain and registers it
def load_robots_txts
config.domain_allowlist.each do |domain|
@@ -178,6 +178,7 @@ def load_robots_txt(domain)
crawl_result
end

#-----------------------------------------------------------------------------------------------
# Seed the crawler with configured URLs
def enqueue_seed_urls
system_logger.debug("Seeding the crawl with #{config.seed_urls.size} URLs...")
@@ -189,6 +190,7 @@ def enqueue_seed_urls
)
end

#-----------------------------------------------------------------------------------------------
# Seed the crawler with pre-configured sitemaps
def enqueue_sitemaps
if config.sitemap_urls.any?
@@ -236,6 +238,7 @@ def fetch_valid_auto_discovered_sitemap_urls!
end
end

#-----------------------------------------------------------------------------------------------
def set_outcome(outcome, message)
@crawl_results[@crawl_stage][:outcome] = outcome
@crawl_results[@crawl_stage][:message] = message
@@ -246,6 +249,7 @@ def executors_available?
task_executors.length < task_executors.max_length
end

#-----------------------------------------------------------------------------------------------
# Checks if we should terminate the crawl loop and sets the outcome value accordingly
def crawl_finished?
return true if @crawl_results[@crawl_stage][:outcome]
@@ -301,6 +305,7 @@ def run_crawl_loop
log_crawl_end_event
end

#-----------------------------------------------------------------------------------------------
# Performs a single iteration of the crawl loop
def prepare_crawl_task
return if shutdown_started?
@@ -323,6 +328,7 @@ def prepare_crawl_task
end
end

#-----------------------------------------------------------------------------------------------
def execute_crawl_task(crawl_task)
# Fetch the page.
crawl_result = execute_task(crawl_task)
@@ -335,6 +341,7 @@ def execute_crawl_task(crawl_task)
raise
end

#-----------------------------------------------------------------------------------------------
# Fetches a URL and logs info about the HTTP request/response.
def execute_task(crawl_task, follow_redirects: false)
crawl_task_progress(crawl_task, 'HTTP execution')
@@ -387,6 +394,7 @@ def process_crawl_result(crawl_task, crawl_result)
events.url_extracted(**extracted_event)
end

#-----------------------------------------------------------------------------------------------
# Extracts links from a given crawl result and pushes them into the crawl queue for processing
def extract_and_enqueue_links(crawl_task, crawl_result)
return if crawl_result.error? || @crawl_stage == CRAWL_STAGE_PURGE
@@ -398,6 +406,7 @@ def extract_and_enqueue_links(crawl_task, crawl_result)
extract_and_enqueue_sitemap_links(crawl_task, crawl_result) if crawl_result.sitemap?
end

#-----------------------------------------------------------------------------------------------
def enqueue_redirect_link(crawl_task, crawl_result)
add_urls_to_backlog(
urls: [crawl_result.location],
@@ -409,6 +418,7 @@ def enqueue_redirect_link(crawl_task, crawl_result)
)
end

#-----------------------------------------------------------------------------------------------
def extract_and_enqueue_html_links(crawl_task, crawl_result)
canonical_link = crawl_result.canonical_link
if canonical_link
@@ -444,6 +454,7 @@ def extract_and_enqueue_html_links(crawl_task, crawl_result)
)
end

#-----------------------------------------------------------------------------------------------
def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
result = crawl_result.extract_links
limit_reached, error = result.values_at(:limit_reached, :error)
@@ -472,6 +483,7 @@ def extract_and_enqueue_sitemap_links(crawl_task, crawl_result)
end
end

#-----------------------------------------------------------------------------------------------
def extract_links(crawl_result, crawl_depth:)
extracted_links = crawl_result.extract_links(limit: config.max_extracted_links_count)
links, limit_reached = extracted_links.values_at(:links, :limit_reached)
@@ -499,6 +511,7 @@ def extract_links(crawl_result, crawl_depth:)
end
end

#-----------------------------------------------------------------------------------------------
# Outputs the results of a single URL processing to an output module configured for the crawl
def output_crawl_result(crawl_result)
retries = 0
@@ -534,6 +547,7 @@ def output_crawl_result(crawl_result)
end
end

#-----------------------------------------------------------------------------------------------
# Adds a set of URLs to the backlog for processing (if they are OK to follow)
def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
return unless urls.any?
@@ -584,6 +598,7 @@ def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: ni
events.crawl_seed(added_urls_count, type: :content) if source_type == SEED_LIST
end

#-----------------------------------------------------------------------------------------------
# Adds a single url to the backlog for processing and logs an event associated with it
# If the queue is full, drops the item on the floor and logs about it.
def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, redirect_chain: []) # rubocop:disable Metrics/ParameterLists
@@ -618,6 +633,7 @@ def add_url_to_backlog(url:, type:, source_type:, crawl_depth:, source_url:, red
)
end

#-----------------------------------------------------------------------------------------------
# Receives a newly-discovered url, makes a decision on what to do with it and records it in the log
# FIXME: Feels like we need a generic way of encoding URL decisions, probably in the rules engine
def check_discovered_url(url:, type:, source_url:, crawl_depth:) # rubocop:disable Metrics/PerceivedComplexity
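The backlog helpers near the end of coordinator.rb take keyword arguments, as the signatures above show. The following is a hedged sketch of how the collapsed enqueue_seed_urls body might drive add_urls_to_backlog: config.seed_urls, the :content type, the SEED_LIST name, and the keyword signature are visible in the hunks above, while the SEED_LIST value, the stand-in logger and config, and the crawl depth of 1 are assumptions for illustration.

require 'logger'

# Stand-ins so the sketch runs on its own; in the repository these come from
# the Coordinator instance and its config.
SEED_LIST = :seed_list # assumed value; only the constant name appears above
SYSTEM_LOGGER = Logger.new($stdout)
CONFIG = Struct.new(:seed_urls).new(['https://example.com/'])

# Stand-in for the real add_urls_to_backlog shown above; it only reports its arguments
def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: nil, redirect_chain: [])
  SYSTEM_LOGGER.info("Would enqueue #{urls.size} #{type} URL(s) from #{source_type} at depth #{crawl_depth}")
end

# Hypothetical body for the collapsed enqueue_seed_urls method above
def enqueue_seed_urls
  SYSTEM_LOGGER.debug("Seeding the crawl with #{CONFIG.seed_urls.size} URLs...")
  add_urls_to_backlog(
    urls: CONFIG.seed_urls,
    type: :content,         # :content also appears in the crawl_seed event above
    source_type: SEED_LIST,
    crawl_depth: 1          # assumed starting depth
  )
end

enqueue_seed_urls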
