Enable debug logs and toggle event logs (#38)
During QA, @jedrazb noted that the logs are noisy and unhelpful, and that
debug logs are always enabled.
While system_logger accepted both `info` and `debug`, it wasn't actually
differentiating between them in the output. Event logging is also _extremely_
noisy and should be disabled for the shell by default.

- Add log levels to system logger with `info` as default level
- Allow system log level to be set in config file
- Allow event log output to be enabled/disabled in config file
- Misc changes to existing log levels (mostly demoting noisy `info` logs to `debug`)
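
The new options can be set per crawl in the config file. A minimal sketch (the keys come from the updated `crawler.yml.example`; the values here simply opt back into the old, verbose behaviour):

```yaml
# Re-enable verbose output for a single crawl config.
# `log_level` accepts debug, info, warn, error, or fatal (the keys of LOG_LEVELS).
log_level: debug
# `event_logs: true` turns per-event logging to the shell back on.
event_logs: true
```
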
navarone-feekery authored May 31, 2024
1 parent 408b118 commit 741f353
Showing 4 changed files with 71 additions and 28 deletions.
52 changes: 39 additions & 13 deletions config/crawler.yml.example
@@ -1,8 +1,19 @@
## Domains allowed for the crawl
## ================== Crawler Configuration - Elasticsearch ====================
#
## Crawler configuration settings. One configuration file can be used to
## define one crawler/crawl job
#
## NOTE: Most Crawler configurations come with reasonable defaults.
## Before adjusting the configuration, make sure you understand what you
## are trying to accomplish and the consequences.
#
## ------------------------------- Crawler ------------------------------------
#
## The domain(s) that Crawler will crawl.
#domain_allowlist:
# - http://localhost:8000
#
## URLs used to seed the crawl
## The URLs used to seed the crawl. Must have the same host(s) as `domain_allowlist`
#seed_urls:
# - http://localhost:8000
#
@@ -15,7 +26,7 @@
## Local directory to output crawl results. Required if output_sink is file
#output_dir: output/local-site
#
## Crawl tuning
## The maximum depth that Crawler will follow links to.
#max_crawl_depth: 2
#
## Crawl result field size limits
@@ -26,7 +37,9 @@
#max_indexed_links_count: 10
#max_headings_count: 10
#
## Enable local proxy
## ------------------------------- Crawler - Advanced --------------------------
#
## Proxy configurations.
#http_proxy_host: localhost
#http_proxy_port: 8888
#http_proxy_protocol: http
@@ -35,15 +48,14 @@
#loopback_allowed: true
#ssl_verification_mode: none
#
## Enable auth
#auth:
# -
# domain: https://parksaustralia.gov.au
# type: basic
# username: user
# password: pass
## Authorization configurations. Only required if a site has some form of authorization.
#auth.domain: https://parksaustralia.gov.au
#auth.type: basic
#auth.username: user
#auth.password: pass
#
## Enable content extraction (from files)
## Whether document metadata from certain content types (files) will be indexed.
## Only metadata is indexed from these files; their binary content is not.
#content_extraction_enabled: true
#content_extraction_mime_types:
# - application/pdf
@@ -52,7 +64,21 @@
# - application/vnd.ms-powerpoint
# - application/vnd.openxmlformats-officedocument.presentationml.presentation
#
## Elasticsearch connection settings. These can be defined for all crawlers in `config/elasticsearch.yml`,
## ------------------------------- Logging -------------------------------------
#
## The log level for system logs. Defaults to `info`.
#log_level: info
#
## Whether event logs are output to the shell running Crawler.
## Event logs are extremely noisy but highly granular, which can be
## useful for debugging failing crawls.
## Defaults to `false`.
#event_logs: false
#
## ------------------------------- Elasticsearch -------------------------------
#
## Elasticsearch connection settings for this specific crawler/crawl job.
## See elasticsearch.yml.example for detailed configurations.
##
#elasticsearch:
# host: http://localhost
29 changes: 23 additions & 6 deletions lib/crawler/api/config.rb
@@ -22,7 +22,18 @@
module Crawler
module API
class Config # rubocop:disable Metrics/ClassLength
LOG_LEVELS = {
debug: Logger::DEBUG,
info: Logger::INFO,
warn: Logger::WARN,
error: Logger::ERROR,
fatal: Logger::FATAL
}.stringify_keys.freeze

CONFIG_FIELDS = [
:log_level, # Log level set in config file, defaults to `info`
:event_logs, # Whether event logs are output to the shell, defaults to `false`

:crawl_id, # Unique identifier of the crawl (used in logs, etc)
:crawl_stage, # Stage name for multi-stage crawls

@@ -111,6 +122,9 @@ class Config # rubocop:disable Metrics/ClassLength
# and in the `Crawler::HttpUtils::Config` class.
# Make sure to check those before renaming or removing any defaults.
DEFAULTS = {
log_level: 'info',
event_logs: false,

crawl_stage: :primary,

sitemap_urls: [],
@@ -192,7 +206,7 @@ def initialize(params = {})
configure_crawl_id!

# Setup logging for free-text and structured events
configure_logging!
configure_logging!(params[:log_level], params[:event_logs])

# Normalize and validate parameters
confugure_ssl_ca_certificates!
@@ -315,12 +329,15 @@ def configure_sitemap_urls!
end

#---------------------------------------------------------------------------------------------
def configure_logging!
@event_logger = Logger.new($stdout)
def configure_logging!(log_level, event_logs_enabled)
@event_logger = Logger.new($stdout) if event_logs_enabled

system_logger = Logger.new($stdout)
system_logger.level = LOG_LEVELS[log_level]

# Add crawl id and stage to all logging events produced by this crawl
base_system_logger = StaticallyTaggedLogger.new(Logger.new($stdout))
@system_logger = base_system_logger.tagged("crawl:#{crawl_id}", crawl_stage)
tagged_system_logger = StaticallyTaggedLogger.new(system_logger)
@system_logger = tagged_system_logger.tagged("crawl:#{crawl_id}", crawl_stage)
end

#---------------------------------------------------------------------------------------------
@@ -344,7 +361,7 @@ def document_mapper
# Receives a crawler event object and outputs it into relevant systems
def output_event(event)
# Log the event
event_logger << "#{event.to_json}\n"
event_logger << "#{event.to_json}\n" if event_logger

# Count stats for the crawl
stats.update_from_event(event)
10 changes: 5 additions & 5 deletions lib/crawler/coordinator.rb
@@ -131,7 +131,7 @@ def load_robots_txt(domain)
elsif crawl_result.error?
system_logger.warn("Error while fetching robots.txt for #{domain}: #{crawl_result.error}")
else
system_logger.info("Fetched robots.txt for #{domain} from '#{crawl_result.url}'")
system_logger.debug("Fetched robots.txt for #{domain} from '#{crawl_result.url}'")
end

crawl_result
@@ -140,7 +140,7 @@ def load_robots_txt(domain)
#-----------------------------------------------------------------------------------------------
# Seed the crawler with configured URLs
def enqueue_seed_urls
system_logger.info("Seeding the crawl with #{config.seed_urls.size} URLs...")
system_logger.debug("Seeding the crawl with #{config.seed_urls.size} URLs...")
add_urls_to_backlog(
urls: config.seed_urls,
type: :content,
@@ -153,7 +153,7 @@ def enqueue_seed_urls
# Seed the crawler with pre-configured sitemaps
def enqueue_sitemaps
if config.sitemap_urls.any?
system_logger.info("Seeding the crawl with #{config.sitemap_urls.count} Sitemap URLs...")
system_logger.debug("Seeding the crawl with #{config.sitemap_urls.count} Sitemap URLs...")
add_urls_to_backlog(
urls: config.sitemap_urls,
type: :sitemap,
@@ -167,7 +167,7 @@ def enqueue_sitemaps
valid_auto_discovered_sitemap_urls = fetch_valid_auto_discovered_sitemap_urls!
return unless valid_auto_discovered_sitemap_urls.any?

system_logger.info(
system_logger.debug(
"Seeding the crawl with #{valid_auto_discovered_sitemap_urls.count} " \
'auto-discovered (via robots.txt) Sitemap URLs...'
)
@@ -511,7 +511,7 @@ def add_urls_to_backlog(urls:, type:, source_type:, crawl_depth:, source_url: ni
# Seeding complete, log about it
return unless added_urls_count.positive?

system_logger.info("Added #{added_urls_count} URLs from a #{source_type} source to the queue...")
system_logger.debug("Added #{added_urls_count} URLs from a #{source_type} source to the queue...")
events.crawl_seed(added_urls_count, type: :content) if source_type == SEED_LIST
end

8 changes: 4 additions & 4 deletions lib/crawler/event_generator.rb
@@ -61,7 +61,7 @@ def crawl_start(url_queue_items:, seen_urls:)
else
'Starting a crawl'
end
system_logger.info("#{action} with the following configuration: #{config}")
system_logger.debug("#{action} with the following configuration: #{config}")

log_crawl_event(
'event.type' => 'start',
@@ -72,7 +72,7 @@
end

def crawl_end(outcome:, message:, resume_possible:)
system_logger.info("Finished a crawl. Result: #{outcome}: #{message}")
system_logger.info("Finished a crawl. Result: #{outcome}; #{message}")
log_crawl_event(
'event.type' => 'end',
'event.action' => 'crawl-end',
@@ -127,7 +127,7 @@ def crawl_status_for_system_log(status)
# URL Life-cycle Events
#-----------------------------------------------------------------------------------------------
def url_seed(url:, source_url:, type:, crawl_depth:, source_type:)
system_logger.info(
system_logger.debug(
"Added a new URL to the crawl queue: '#{url}' (type: #{type}, source: #{source_type}, depth: #{crawl_depth})"
)
log_url_event(
@@ -146,7 +146,7 @@ def url_seed(url:, source_url:, type:, crawl_depth:, source_type:)
def url_fetch(url:, crawl_result:, auth_type: nil) # rubocop:disable Metrics/AbcSize
status_code = crawl_result.status_code
outcome = outcome_from_status_code(status_code)
system_logger.info("Fetched a page '#{url}' with a status code #{status_code} and an outcome of '#{outcome}'")
system_logger.debug("Fetched a page '#{url}' with a status code #{status_code} and an outcome of '#{outcome}'")

event = {
'crawler.url.auth.type' => auth_type,
