diff --git a/config/crawler.yml.example b/config/crawler.yml.example
index 1e1322a..9835fc9 100644
--- a/config/crawler.yml.example
+++ b/config/crawler.yml.example
@@ -54,6 +54,13 @@
 ## The maximum depth that Crawler will follow links to.
 #max_crawl_depth: 2
 #
+## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true
+#purge_crawl_enabled: true
+#
+## Whether or not to include the full HTML in the crawl result. Enabling full HTML extraction can
+## dramatically increase the index size if the site being crawled is large. Defaults to false.
+#full_html_extraction_enabled: false
+#
 ## Scheduling using cron expressions
 #schedule:
 #  pattern: "0 12 * * *" # every day at noon
diff --git a/lib/crawler/api/config.rb b/lib/crawler/api/config.rb
index 5d30dda..fd74dba 100644
--- a/lib/crawler/api/config.rb
+++ b/lib/crawler/api/config.rb
@@ -54,7 +54,8 @@ class Config # rubocop:disable Metrics/ClassLength
     :results_collection, # An Enumerable collection for storing mock crawl results
     :user_agent, # The User-Agent used for requests made from the crawler.
     :stats_dump_interval, # How often should we output stats in the logs during a crawl
-    :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+    :purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+    :full_html_extraction_enabled, # Whether or not to include the full HTML in the crawl result JSON
 
     # Elasticsearch settings
     :elasticsearch, # Elasticsearch connection settings
@@ -180,7 +181,8 @@ class Config # rubocop:disable Metrics/ClassLength
     extraction_rules: {},
     crawl_rules: {},
 
-    purge_crawl_enabled: true
+    purge_crawl_enabled: true,
+    full_html_extraction_enabled: false
   }.freeze
 
   # Settings we are not allowed to log due to their sensitive nature
diff --git a/lib/crawler/document_mapper.rb b/lib/crawler/document_mapper.rb
index eb4ec6e..4311570 100644
--- a/lib/crawler/document_mapper.rb
+++ b/lib/crawler/document_mapper.rb
@@ -57,14 +57,15 @@ def core_fields(crawl_result)
       }
     end
 
-    def html_fields(crawl_result)
+    def html_fields(crawl_result) # rubocop:disable Metrics/AbcSize
       remove_empty_values(
         title: crawl_result.document_title(limit: config.max_title_size),
         body: crawl_result.document_body(limit: config.max_body_size),
         meta_keywords: crawl_result.meta_keywords(limit: config.max_keywords_size),
         meta_description: crawl_result.meta_description(limit: config.max_description_size),
         links: crawl_result.links(limit: config.max_indexed_links_count),
-        headings: crawl_result.headings(limit: config.max_headings_count)
+        headings: crawl_result.headings(limit: config.max_headings_count),
+        full_html: crawl_result.full_html(enabled: config.full_html_extraction_enabled)
       )
     end
 
diff --git a/spec/lib/crawler/document_mapper_spec.rb b/spec/lib/crawler/document_mapper_spec.rb
index 5caf77c..1d93653 100644
--- a/spec/lib/crawler/document_mapper_spec.rb
+++ b/spec/lib/crawler/document_mapper_spec.rb
@@ -131,6 +131,26 @@
         expect(result).to eq(expected_result_limited)
       end
     end
+
+    context 'when full HTML extraction is enabled' do
+      let(:config_params) do
+        {
+          domains: [{ url: url.to_s }],
+          full_html_extraction_enabled: true
+        }
+      end
+      let(:expected_result_extracted) do
+        expected_result.merge(
+          full_html: Nokogiri::HTML(content).inner_html
+        )
+      end
+
+      it 'includes the full HTML in the result' do
+        result = subject.create_doc(crawl_result)
+
+        expect(result).to eq(expected_result_extracted)
+      end
+    end
   end
 
   context 'when crawl result is a binary file' do
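
Note: the crawl_result.full_html(enabled:) accessor invoked from document_mapper.rb is not part of this diff. Below is a minimal sketch of what it might look like, inferred from the call site and from the Nokogiri expectation in the spec above; the class name and the content reader are assumptions, not the project's actual implementation.

    require 'nokogiri'

    # Hypothetical stand-in for the HTML crawl result class (not shown in this diff).
    class HtmlCrawlResult
      attr_reader :content # raw HTML payload of the fetched page

      def initialize(content)
        @content = content
      end

      # Returns the page's full HTML when extraction is enabled, nil otherwise.
      # DocumentMapper#html_fields passes the result through remove_empty_values,
      # so a nil here means the full_html field is simply omitted from the document.
      def full_html(enabled: false)
        return unless enabled

        Nokogiri::HTML(content).inner_html
      end
    end

With full_html_extraction_enabled left at its default of false, the accessor returns nil and the field never reaches the index, which is why the spec only asserts its presence in the enabled context.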