From 2c48a7874f53202d81b0a0afd9745363ea46f900 Mon Sep 17 00:00:00 2001 From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:17:10 +0100 Subject: [PATCH 1/3] Enable full HTML extraction --- config/crawler.yml.example | 7 +++++++ lib/crawler/api/config.rb | 6 ++++-- lib/crawler/document_mapper.rb | 3 ++- spec/lib/crawler/document_mapper_spec.rb | 20 ++++++++++++++++++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/config/crawler.yml.example b/config/crawler.yml.example index 1e1322a3..461e74f2 100644 --- a/config/crawler.yml.example +++ b/config/crawler.yml.example @@ -54,6 +54,13 @@ ## The maximum depth that Crawler will follow links to. #max_crawl_depth: 2 # +## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true +#purge_crawl_enabled: true +# +## Whether or not to include the full HTML in the crawl result Enabling full HTML extraction can +## dramatically increase the index size if the site being crawled is large. Defaults to false. +#extract_full_html: false +# ## Scheduling using cron expressions #schedule: # pattern: "0 12 * * *" # every day at noon diff --git a/lib/crawler/api/config.rb b/lib/crawler/api/config.rb index 5d30ddae..cd09a4ab 100644 --- a/lib/crawler/api/config.rb +++ b/lib/crawler/api/config.rb @@ -54,7 +54,8 @@ class Config # rubocop:disable Metrics/ClassLength :results_collection, # An Enumerable collection for storing mock crawl results :user_agent, # The User-Agent used for requests made from the crawler. :stats_dump_interval, # How often should we output stats in the logs during a crawl - :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks + :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks + :extract_full_html, # Whether or not to include the full HTML in the crawl result JSON # Elasticsearch settings :elasticsearch, # Elasticsearch connection settings @@ -180,7 +181,8 @@ class Config # rubocop:disable Metrics/ClassLength extraction_rules: {}, crawl_rules: {}, - purge_crawl_enabled: true + purge_crawl_enabled: true, + extract_full_html: false }.freeze # Settings we are not allowed to log due to their sensitive nature diff --git a/lib/crawler/document_mapper.rb b/lib/crawler/document_mapper.rb index eb4ec6e7..5661813c 100644 --- a/lib/crawler/document_mapper.rb +++ b/lib/crawler/document_mapper.rb @@ -64,7 +64,8 @@ def html_fields(crawl_result) meta_keywords: crawl_result.meta_keywords(limit: config.max_keywords_size), meta_description: crawl_result.meta_description(limit: config.max_description_size), links: crawl_result.links(limit: config.max_indexed_links_count), - headings: crawl_result.headings(limit: config.max_headings_count) + headings: crawl_result.headings(limit: config.max_headings_count), + full_html: crawl_result.full_html(enabled: config.extract_full_html) ) end diff --git a/spec/lib/crawler/document_mapper_spec.rb b/spec/lib/crawler/document_mapper_spec.rb index 5caf77c0..4331ab37 100644 --- a/spec/lib/crawler/document_mapper_spec.rb +++ b/spec/lib/crawler/document_mapper_spec.rb @@ -131,6 +131,26 @@ expect(result).to eq(expected_result_limited) end end + + context "when full HTML extraction is enabled" do + let(:config_params) do + { + domains: [{ url: url.to_s}], + extract_full_html: true + } + end + let(:expected_result_extracted) do + expected_result.merge( + full_html: Nokogiri::HTML(content).inner_html + ) + end + + it 'includes the full HTML in the result' do + result = subject.create_doc(crawl_result) + + expect(result).to eq(expected_result_extracted) + end + end end context 'when crawl result is a binary file' do From 042a4195758253d9d7a5919a73a591633eb74358 Mon Sep 17 00:00:00 2001 From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:22:18 +0100 Subject: [PATCH 2/3] Rename config field --- config/crawler.yml.example | 2 +- lib/crawler/api/config.rb | 4 ++-- lib/crawler/document_mapper.rb | 2 +- spec/lib/crawler/document_mapper_spec.rb | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/config/crawler.yml.example b/config/crawler.yml.example index 461e74f2..9835fc9e 100644 --- a/config/crawler.yml.example +++ b/config/crawler.yml.example @@ -59,7 +59,7 @@ # ## Whether or not to include the full HTML in the crawl result Enabling full HTML extraction can ## dramatically increase the index size if the site being crawled is large. Defaults to false. -#extract_full_html: false +#full_html_extraction_enabled: false # ## Scheduling using cron expressions #schedule: diff --git a/lib/crawler/api/config.rb b/lib/crawler/api/config.rb index cd09a4ab..fd74dbac 100644 --- a/lib/crawler/api/config.rb +++ b/lib/crawler/api/config.rb @@ -55,7 +55,7 @@ class Config # rubocop:disable Metrics/ClassLength :user_agent, # The User-Agent used for requests made from the crawler. :stats_dump_interval, # How often should we output stats in the logs during a crawl :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks - :extract_full_html, # Whether or not to include the full HTML in the crawl result JSON + :full_html_extraction_enabled, # Whether or not to include the full HTML in the crawl result JSON # Elasticsearch settings :elasticsearch, # Elasticsearch connection settings @@ -182,7 +182,7 @@ class Config # rubocop:disable Metrics/ClassLength extraction_rules: {}, crawl_rules: {}, purge_crawl_enabled: true, - extract_full_html: false + full_html_extraction_enabled: false }.freeze # Settings we are not allowed to log due to their sensitive nature diff --git a/lib/crawler/document_mapper.rb b/lib/crawler/document_mapper.rb index 5661813c..8934eb22 100644 --- a/lib/crawler/document_mapper.rb +++ b/lib/crawler/document_mapper.rb @@ -65,7 +65,7 @@ def html_fields(crawl_result) meta_description: crawl_result.meta_description(limit: config.max_description_size), links: crawl_result.links(limit: config.max_indexed_links_count), headings: crawl_result.headings(limit: config.max_headings_count), - full_html: crawl_result.full_html(enabled: config.extract_full_html) + full_html: crawl_result.full_html(enabled: config.full_html_extraction_enabled) ) end diff --git a/spec/lib/crawler/document_mapper_spec.rb b/spec/lib/crawler/document_mapper_spec.rb index 4331ab37..bebb9163 100644 --- a/spec/lib/crawler/document_mapper_spec.rb +++ b/spec/lib/crawler/document_mapper_spec.rb @@ -136,7 +136,7 @@ let(:config_params) do { domains: [{ url: url.to_s}], - extract_full_html: true + full_html_extraction_enabled: true } end let(:expected_result_extracted) do From 6ca2a52177d83ac3bce4c33d597afe9b3666b4ec Mon Sep 17 00:00:00 2001 From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com> Date: Thu, 6 Feb 2025 11:23:34 +0100 Subject: [PATCH 3/3] Fix lint --- lib/crawler/document_mapper.rb | 2 +- spec/lib/crawler/document_mapper_spec.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/crawler/document_mapper.rb b/lib/crawler/document_mapper.rb index 8934eb22..43115706 100644 --- a/lib/crawler/document_mapper.rb +++ b/lib/crawler/document_mapper.rb @@ -57,7 +57,7 @@ def core_fields(crawl_result) } end - def html_fields(crawl_result) + def html_fields(crawl_result) # rubocop:disable Metrics/AbcSize remove_empty_values( title: crawl_result.document_title(limit: config.max_title_size), body: crawl_result.document_body(limit: config.max_body_size), diff --git a/spec/lib/crawler/document_mapper_spec.rb b/spec/lib/crawler/document_mapper_spec.rb index bebb9163..1d936533 100644 --- a/spec/lib/crawler/document_mapper_spec.rb +++ b/spec/lib/crawler/document_mapper_spec.rb @@ -132,10 +132,10 @@ end end - context "when full HTML extraction is enabled" do + context 'when full HTML extraction is enabled' do let(:config_params) do { - domains: [{ url: url.to_s}], + domains: [{ url: url.to_s }], full_html_extraction_enabled: true } end