[0.2] Allow for full HTML extraction (#204) (#208)
Backports the following commits to 0.2:
 - Allow for full HTML extraction (#204)

Co-authored-by: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com>
github-actions[bot] and navarone-feekery authored Feb 6, 2025
1 parent ff8b56d · commit 67b42c9
Showing 4 changed files with 34 additions and 4 deletions.
7 changes: 7 additions & 0 deletions config/crawler.yml.example
@@ -54,6 +54,13 @@
## The maximum depth that Crawler will follow links to.
#max_crawl_depth: 2
#
## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true
#purge_crawl_enabled: true
#
## Whether or not to include the full HTML in the crawl result. Enabling full HTML extraction can
## dramatically increase the index size if the site being crawled is large. Defaults to false.
#full_html_extraction_enabled: false
#
## Scheduling using cron expressions
#schedule:
# pattern: "0 12 * * *" # every day at noon
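For orientation, a minimal sketch of the new setting in use: a config fragment with full_html_extraction_enabled turned on, parsed with Ruby's stdlib YAML. The domains shape is borrowed from the spec further down; nothing here comes from the crawler's actual config-loading code.

require 'yaml'

# Hypothetical config fragment; only the full_html_extraction_enabled key and
# its default of false come from this commit.
settings = YAML.safe_load(<<~YML)
  domains:
    - url: https://example.com
  full_html_extraction_enabled: true
YML

settings.fetch('full_html_extraction_enabled', false) # => true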
6 changes: 4 additions & 2 deletions lib/crawler/api/config.rb
@@ -54,7 +54,8 @@ class Config # rubocop:disable Metrics/ClassLength
:results_collection, # An Enumerable collection for storing mock crawl results
:user_agent, # The User-Agent used for requests made from the crawler.
:stats_dump_interval, # How often should we output stats in the logs during a crawl
:purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+:full_html_extraction_enabled, # Whether or not to include the full HTML in the crawl result JSON

# Elasticsearch settings
:elasticsearch, # Elasticsearch connection settings
@@ -180,7 +181,8 @@ class Config # rubocop:disable Metrics/ClassLength

extraction_rules: {},
crawl_rules: {},
-purge_crawl_enabled: true
+purge_crawl_enabled: true,
+full_html_extraction_enabled: false
}.freeze

# Settings we are not allowed to log due to their sensitive nature
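Putting the two hunks together: the flag becomes a regular config field with a default of false. A sketch of setting it programmatically, mirroring the config_params hash used in the spec below; the exact Crawler::API::Config constructor signature is an assumption, not something this diff shows.

config = Crawler::API::Config.new(
  domains: [{ url: 'https://example.com' }], # hypothetical domain
  full_html_extraction_enabled: true         # defaults to false when omitted
)

config.full_html_extraction_enabled # => true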
5 changes: 3 additions & 2 deletions lib/crawler/document_mapper.rb
@@ -57,14 +57,15 @@ def core_fields(crawl_result)
}
end

-def html_fields(crawl_result)
+def html_fields(crawl_result) # rubocop:disable Metrics/AbcSize
remove_empty_values(
title: crawl_result.document_title(limit: config.max_title_size),
body: crawl_result.document_body(limit: config.max_body_size),
meta_keywords: crawl_result.meta_keywords(limit: config.max_keywords_size),
meta_description: crawl_result.meta_description(limit: config.max_description_size),
links: crawl_result.links(limit: config.max_indexed_links_count),
-headings: crawl_result.headings(limit: config.max_headings_count)
+headings: crawl_result.headings(limit: config.max_headings_count),
+full_html: crawl_result.full_html(enabled: config.full_html_extraction_enabled)
)
end

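Note that full_html goes through remove_empty_values like every other field, so when the flag is off the key should simply drop out of the mapped document and existing installs see no change. A behavior sketch (not the actual CrawlResult implementation); the Nokogiri call matches the expectation in the spec below.

require 'nokogiri'

# Illustrative stand-in for crawl_result.full_html(enabled:); returns nil when
# extraction is disabled so the field is stripped from the mapped document.
def sketch_full_html(raw_html, enabled:)
  return nil unless enabled

  Nokogiri::HTML(raw_html).inner_html
end

doc = {
  title: 'Example',
  full_html: sketch_full_html('<p>hi</p>', enabled: false)
}.compact # stands in for remove_empty_values in this sketch

doc.key?(:full_html) # => false; with enabled: true the serialized page is included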
20 changes: 20 additions & 0 deletions spec/lib/crawler/document_mapper_spec.rb
@@ -131,6 +131,26 @@
expect(result).to eq(expected_result_limited)
end
end

context 'when full HTML extraction is enabled' do
let(:config_params) do
{
domains: [{ url: url.to_s }],
full_html_extraction_enabled: true
}
end
let(:expected_result_extracted) do
expected_result.merge(
full_html: Nokogiri::HTML(content).inner_html
)
end

it 'includes the full HTML in the result' do
result = subject.create_doc(crawl_result)

expect(result).to eq(expected_result_extracted)
end
end
end

context 'when crawl result is a binary file' do
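One detail the expectation above makes explicit: the stored value is Nokogiri::HTML(content).inner_html, i.e. the page as re-serialized by the parser rather than the raw response body, so tag balancing and similar normalization may apply. A quick way to see this:

require 'nokogiri'

raw = '<html><body><p>Unclosed paragraph'
puts Nokogiri::HTML(raw).inner_html
# Prints parser-normalized markup (closed tags, document structure filled in)
# rather than the raw string above.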
