Allow for full HTML extraction #204

Merged 4 commits on Feb 6, 2025
config/crawler.yml.example (7 additions, 0 deletions)
@@ -54,6 +54,13 @@
 ## The maximum depth that Crawler will follow links to.
 #max_crawl_depth: 2
 #
+## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true.
+#purge_crawl_enabled: true
+#
+## Whether or not to include the full HTML in the crawl result. Enabling full HTML extraction can
+## dramatically increase the index size if the site being crawled is large. Defaults to false.
+#full_html_extraction_enabled: false
+#
 ## Scheduling using cron expressions
 #schedule:
 #  pattern: "0 12 * * *" # every day at noon
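As a usage note: to turn the new behavior on, the relevant key in a real crawler.yml would be uncommented and flipped from its default. A minimal sketch, assuming a hypothetical site URL for the domains entry:

domains:
  - url: "https://example.com"  # hypothetical site to crawl

# Opt in to storing the full page HTML with each crawl result.
# Note: this can dramatically increase index size on large sites.
full_html_extraction_enabled: true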
lib/crawler/api/config.rb (4 additions, 2 deletions)
@@ -54,7 +54,8 @@ class Config # rubocop:disable Metrics/ClassLength
 :results_collection, # An Enumerable collection for storing mock crawl results
 :user_agent, # The User-Agent used for requests made from the crawler.
 :stats_dump_interval, # How often should we output stats in the logs during a crawl
-:purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+:purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+:full_html_extraction_enabled, # Whether or not to include the full HTML in the crawl result JSON

 # Elasticsearch settings
 :elasticsearch, # Elasticsearch connection settings
@@ -180,7 +181,8 @@ class Config # rubocop:disable Metrics/ClassLength

 extraction_rules: {},
 crawl_rules: {},
-purge_crawl_enabled: true
+purge_crawl_enabled: true,
+full_html_extraction_enabled: false
 }.freeze

 # Settings we are not allowed to log due to their sensitive nature
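The second hunk wires the flag into the frozen defaults hash, so it stays off unless a config explicitly enables it. A minimal Ruby sketch of that defaults-then-override pattern, assuming user-supplied params are merged over the defaults (the real Config class does more validation than this):

# Sketch only: illustrates the defaults merge, not the actual Config internals.
DEFAULT_CONFIG = {
  purge_crawl_enabled: true,
  full_html_extraction_enabled: false
}.freeze

def resolve_config(params)
  # User-supplied params win over the frozen defaults.
  DEFAULT_CONFIG.merge(params)
end

resolve_config(full_html_extraction_enabled: true)
# => { purge_crawl_enabled: true, full_html_extraction_enabled: true }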
lib/crawler/document_mapper.rb (3 additions, 2 deletions)
@@ -57,14 +57,15 @@ def core_fields(crawl_result)
   }
 end

-def html_fields(crawl_result)
+def html_fields(crawl_result) # rubocop:disable Metrics/AbcSize
   remove_empty_values(
     title: crawl_result.document_title(limit: config.max_title_size),
     body: crawl_result.document_body(limit: config.max_body_size),
     meta_keywords: crawl_result.meta_keywords(limit: config.max_keywords_size),
     meta_description: crawl_result.meta_description(limit: config.max_description_size),
     links: crawl_result.links(limit: config.max_indexed_links_count),
-    headings: crawl_result.headings(limit: config.max_headings_count)
+    headings: crawl_result.headings(limit: config.max_headings_count),
+    full_html: crawl_result.full_html(enabled: config.full_html_extraction_enabled)
   )
 end

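Judging by the spec below, CrawlResult#full_html presumably returns the page's serialized HTML only when the flag is on, and remove_empty_values then drops the nil entry entirely, so disabled crawls pay no index cost. A hedged sketch of that assumed behavior (CrawlResultSketch and the remove_empty_values helper here are illustrations, not the project's real classes):

require 'nokogiri'

# Sketch of the assumed CrawlResult behavior, not the real class.
class CrawlResultSketch
  def initialize(content)
    @html = Nokogiri::HTML(content)
  end

  # Full inner HTML when extraction is enabled, nil otherwise.
  def full_html(enabled: false)
    @html.inner_html if enabled
  end
end

# Assumed helper: strips nil/empty values so :full_html vanishes when disabled.
def remove_empty_values(hash)
  hash.reject { |_k, v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
end

doc = CrawlResultSketch.new('<p>hi</p>')
remove_empty_values(full_html: doc.full_html(enabled: false)) # => {}
remove_empty_values(full_html: doc.full_html(enabled: true))  # includes :full_html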
spec/lib/crawler/document_mapper_spec.rb (20 additions, 0 deletions)
@@ -131,6 +131,26 @@
       expect(result).to eq(expected_result_limited)
     end
   end
+
+  context 'when full HTML extraction is enabled' do
+    let(:config_params) do
+      {
+        domains: [{ url: url.to_s }],
+        full_html_extraction_enabled: true
+      }
+    end
+    let(:expected_result_extracted) do
+      expected_result.merge(
+        full_html: Nokogiri::HTML(content).inner_html
+      )
+    end
+
+    it 'includes the full HTML in the result' do
+      result = subject.create_doc(crawl_result)
+
+      expect(result).to eq(expected_result_extracted)
+    end
+  end
 end

 context 'when crawl result is a binary file' do
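Putting the pieces together: with the flag enabled, a mapped document carries full_html alongside the usual extracted fields from core_fields and html_fields. Roughly this shape, with purely illustrative values:

{
  title: 'Example page',
  body: 'hi',
  links: ['https://example.com/about'],
  headings: ['Example page'],
  full_html: '<head></head><body><p>hi</p></body>'
}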