From 2c48a7874f53202d81b0a0afd9745363ea46f900 Mon Sep 17 00:00:00 2001
From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com>
Date: Thu, 6 Feb 2025 11:17:10 +0100
Subject: [PATCH 1/3] Enable full HTML extraction

---
 config/crawler.yml.example               |  7 +++++++
 lib/crawler/api/config.rb                |  6 ++++--
 lib/crawler/document_mapper.rb           |  3 ++-
 spec/lib/crawler/document_mapper_spec.rb | 20 ++++++++++++++++++++
 4 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/config/crawler.yml.example b/config/crawler.yml.example
index 1e1322a3..461e74f2 100644
--- a/config/crawler.yml.example
+++ b/config/crawler.yml.example
@@ -54,6 +54,13 @@
 ## The maximum depth that Crawler will follow links to.
 #max_crawl_depth: 2
 #
+## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true
+#purge_crawl_enabled: true
+#
+## Whether or not to include the full HTML in the crawl result Enabling full HTML extraction can
+##   dramatically increase the index size if the site being crawled is large. Defaults to false.
+#extract_full_html: false
+#
 ## Scheduling using cron expressions
 #schedule:
 #  pattern: "0 12 * * *"     # every day at noon
diff --git a/lib/crawler/api/config.rb b/lib/crawler/api/config.rb
index 5d30ddae..cd09a4ab 100644
--- a/lib/crawler/api/config.rb
+++ b/lib/crawler/api/config.rb
@@ -54,7 +54,8 @@ class Config # rubocop:disable Metrics/ClassLength
         :results_collection,   # An Enumerable collection for storing mock crawl results
         :user_agent,           # The User-Agent used for requests made from the crawler.
         :stats_dump_interval,  # How often should we output stats in the logs during a crawl
-        :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+        :purge_crawl_enabled,  # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+        :extract_full_html,    # Whether or not to include the full HTML in the crawl result JSON
 
         # Elasticsearch settings
         :elasticsearch, # Elasticsearch connection settings
@@ -180,7 +181,8 @@ class Config # rubocop:disable Metrics/ClassLength
 
         extraction_rules: {},
         crawl_rules: {},
-        purge_crawl_enabled: true
+        purge_crawl_enabled: true,
+        extract_full_html: false
       }.freeze
 
       # Settings we are not allowed to log due to their sensitive nature
diff --git a/lib/crawler/document_mapper.rb b/lib/crawler/document_mapper.rb
index eb4ec6e7..5661813c 100644
--- a/lib/crawler/document_mapper.rb
+++ b/lib/crawler/document_mapper.rb
@@ -64,7 +64,8 @@ def html_fields(crawl_result)
         meta_keywords: crawl_result.meta_keywords(limit: config.max_keywords_size),
         meta_description: crawl_result.meta_description(limit: config.max_description_size),
         links: crawl_result.links(limit: config.max_indexed_links_count),
-        headings: crawl_result.headings(limit: config.max_headings_count)
+        headings: crawl_result.headings(limit: config.max_headings_count),
+        full_html: crawl_result.full_html(enabled: config.extract_full_html)
       )
     end
 
diff --git a/spec/lib/crawler/document_mapper_spec.rb b/spec/lib/crawler/document_mapper_spec.rb
index 5caf77c0..4331ab37 100644
--- a/spec/lib/crawler/document_mapper_spec.rb
+++ b/spec/lib/crawler/document_mapper_spec.rb
@@ -131,6 +131,26 @@
           expect(result).to eq(expected_result_limited)
         end
       end
+
+      context "when full HTML extraction is enabled" do
+        let(:config_params) do
+          {
+            domains: [{ url: url.to_s}],
+            extract_full_html: true
+          }
+        end
+        let(:expected_result_extracted) do
+          expected_result.merge(
+            full_html: Nokogiri::HTML(content).inner_html
+          )
+        end
+
+        it 'includes the full HTML in the result' do
+          result = subject.create_doc(crawl_result)
+
+          expect(result).to eq(expected_result_extracted)
+        end
+      end
     end
 
     context 'when crawl result is a binary file' do

From 042a4195758253d9d7a5919a73a591633eb74358 Mon Sep 17 00:00:00 2001
From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com>
Date: Thu, 6 Feb 2025 11:22:18 +0100
Subject: [PATCH 2/3] Rename config field

---
 config/crawler.yml.example               | 2 +-
 lib/crawler/api/config.rb                | 4 ++--
 lib/crawler/document_mapper.rb           | 2 +-
 spec/lib/crawler/document_mapper_spec.rb | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/config/crawler.yml.example b/config/crawler.yml.example
index 461e74f2..9835fc9e 100644
--- a/config/crawler.yml.example
+++ b/config/crawler.yml.example
@@ -59,7 +59,7 @@
 #
 ## Whether or not to include the full HTML in the crawl result Enabling full HTML extraction can
 ##   dramatically increase the index size if the site being crawled is large. Defaults to false.
-#extract_full_html: false
+#full_html_extraction_enabled: false
 #
 ## Scheduling using cron expressions
 #schedule:
diff --git a/lib/crawler/api/config.rb b/lib/crawler/api/config.rb
index cd09a4ab..fd74dbac 100644
--- a/lib/crawler/api/config.rb
+++ b/lib/crawler/api/config.rb
@@ -55,7 +55,7 @@ class Config # rubocop:disable Metrics/ClassLength
         :user_agent,           # The User-Agent used for requests made from the crawler.
         :stats_dump_interval,  # How often should we output stats in the logs during a crawl
         :purge_crawl_enabled,  # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
-        :extract_full_html,    # Whether or not to include the full HTML in the crawl result JSON
+        :full_html_extraction_enabled, # Whether or not to include the full HTML in the crawl result JSON
 
         # Elasticsearch settings
         :elasticsearch, # Elasticsearch connection settings
@@ -182,7 +182,7 @@ class Config # rubocop:disable Metrics/ClassLength
         extraction_rules: {},
         crawl_rules: {},
         purge_crawl_enabled: true,
-        extract_full_html: false
+        full_html_extraction_enabled: false
       }.freeze
 
       # Settings we are not allowed to log due to their sensitive nature
diff --git a/lib/crawler/document_mapper.rb b/lib/crawler/document_mapper.rb
index 5661813c..8934eb22 100644
--- a/lib/crawler/document_mapper.rb
+++ b/lib/crawler/document_mapper.rb
@@ -65,7 +65,7 @@ def html_fields(crawl_result)
         meta_description: crawl_result.meta_description(limit: config.max_description_size),
         links: crawl_result.links(limit: config.max_indexed_links_count),
         headings: crawl_result.headings(limit: config.max_headings_count),
-        full_html: crawl_result.full_html(enabled: config.extract_full_html)
+        full_html: crawl_result.full_html(enabled: config.full_html_extraction_enabled)
       )
     end
 
diff --git a/spec/lib/crawler/document_mapper_spec.rb b/spec/lib/crawler/document_mapper_spec.rb
index 4331ab37..bebb9163 100644
--- a/spec/lib/crawler/document_mapper_spec.rb
+++ b/spec/lib/crawler/document_mapper_spec.rb
@@ -136,7 +136,7 @@
         let(:config_params) do
           {
             domains: [{ url: url.to_s}],
-            extract_full_html: true
+            full_html_extraction_enabled: true
           }
         end
         let(:expected_result_extracted) do

From 6ca2a52177d83ac3bce4c33d597afe9b3666b4ec Mon Sep 17 00:00:00 2001
From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com>
Date: Thu, 6 Feb 2025 11:23:34 +0100
Subject: [PATCH 3/3] Fix lint

---
 lib/crawler/document_mapper.rb           | 2 +-
 spec/lib/crawler/document_mapper_spec.rb | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/crawler/document_mapper.rb b/lib/crawler/document_mapper.rb
index 8934eb22..43115706 100644
--- a/lib/crawler/document_mapper.rb
+++ b/lib/crawler/document_mapper.rb
@@ -57,7 +57,7 @@ def core_fields(crawl_result)
       }
     end
 
-    def html_fields(crawl_result)
+    def html_fields(crawl_result) # rubocop:disable Metrics/AbcSize
       remove_empty_values(
         title: crawl_result.document_title(limit: config.max_title_size),
         body: crawl_result.document_body(limit: config.max_body_size),
diff --git a/spec/lib/crawler/document_mapper_spec.rb b/spec/lib/crawler/document_mapper_spec.rb
index bebb9163..1d936533 100644
--- a/spec/lib/crawler/document_mapper_spec.rb
+++ b/spec/lib/crawler/document_mapper_spec.rb
@@ -132,10 +132,10 @@
         end
       end
 
-      context "when full HTML extraction is enabled" do
+      context 'when full HTML extraction is enabled' do
         let(:config_params) do
           {
-            domains: [{ url: url.to_s}],
+            domains: [{ url: url.to_s }],
             full_html_extraction_enabled: true
           }
         end