[0.2] Allow for full HTML extraction (#204) (#208)
Backports the following commits to 0.2:
 - Allow for full HTML extraction (#204)

Co-authored-by: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com>
github-actions[bot] and navarone-feekery authored Feb 6, 2025
1 parent ff8b56d · commit 67b42c9
Showing 4 changed files with 34 additions and 4 deletions.
7 changes: 7 additions & 0 deletions config/crawler.yml.example
@@ -54,6 +54,13 @@
## The maximum depth that Crawler will follow links to.
#max_crawl_depth: 2
#
## Whether or not the crawler should purge outdated documents after completing a crawl. Defaults to true
#purge_crawl_enabled: true
#
## Whether or not to include the full HTML in the crawl result. Enabling full HTML extraction can
## dramatically increase the index size if the site being crawled is large. Defaults to false.
#full_html_extraction_enabled: false
#
## Scheduling using cron expressions
#schedule:
# pattern: "0 12 * * *" # every day at noon
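For orientation, a minimal sketch of the new setting in use: a config fragment with full_html_extraction_enabled turned on, parsed with Ruby's stdlib YAML. The domains shape is borrowed from the spec further down; nothing here comes from the crawler's actual config-loading code.

require 'yaml'

# Hypothetical config fragment; only the full_html_extraction_enabled key and
# its default of false come from this commit.
settings = YAML.safe_load(<<~YML)
  domains:
    - url: https://example.com
  full_html_extraction_enabled: true
YML

settings.fetch('full_html_extraction_enabled', false) # => true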
6 changes: 4 additions & 2 deletions lib/crawler/api/config.rb
@@ -54,7 +54,8 @@ class Config # rubocop:disable Metrics/ClassLength
:results_collection, # An Enumerable collection for storing mock crawl results
:user_agent, # The User-Agent used for requests made from the crawler.
:stats_dump_interval, # How often should we output stats in the logs during a crawl
:purge_crawl_enabled, # Whether or not to purge ES docs after a crawl, only possible for elasticsearch sinks
+:full_html_extraction_enabled, # Whether or not to include the full HTML in the crawl result JSON

# Elasticsearch settings
:elasticsearch, # Elasticsearch connection settings
@@ -180,7 +181,8 @@ class Config # rubocop:disable Metrics/ClassLength

extraction_rules: {},
crawl_rules: {},
-purge_crawl_enabled: true
+purge_crawl_enabled: true,
+full_html_extraction_enabled: false
}.freeze

# Settings we are not allowed to log due to their sensitive nature
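Putting the two hunks together: the flag becomes a regular config field with a default of false. A sketch of setting it programmatically, mirroring the config_params hash used in the spec below; the exact Crawler::API::Config constructor signature is an assumption, not something this diff shows.

config = Crawler::API::Config.new(
  domains: [{ url: 'https://example.com' }], # hypothetical domain
  full_html_extraction_enabled: true         # defaults to false when omitted
)

config.full_html_extraction_enabled # => true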
5 changes: 3 additions & 2 deletions lib/crawler/document_mapper.rb
@@ -57,14 +57,15 @@ def core_fields(crawl_result)
}
end

-def html_fields(crawl_result)
+def html_fields(crawl_result) # rubocop:disable Metrics/AbcSize
remove_empty_values(
title: crawl_result.document_title(limit: config.max_title_size),
body: crawl_result.document_body(limit: config.max_body_size),
meta_keywords: crawl_result.meta_keywords(limit: config.max_keywords_size),
meta_description: crawl_result.meta_description(limit: config.max_description_size),
links: crawl_result.links(limit: config.max_indexed_links_count),
-headings: crawl_result.headings(limit: config.max_headings_count)
+headings: crawl_result.headings(limit: config.max_headings_count),
+full_html: crawl_result.full_html(enabled: config.full_html_extraction_enabled)
)
end

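Note that full_html goes through remove_empty_values like every other field, so when the flag is off the key should simply drop out of the mapped document and existing installs see no change. A behavior sketch (not the actual CrawlResult implementation); the Nokogiri call matches the expectation in the spec below.

require 'nokogiri'

# Illustrative stand-in for crawl_result.full_html(enabled:); returns nil when
# extraction is disabled so the field is stripped from the mapped document.
def sketch_full_html(raw_html, enabled:)
  return nil unless enabled

  Nokogiri::HTML(raw_html).inner_html
end

doc = {
  title: 'Example',
  full_html: sketch_full_html('<p>hi</p>', enabled: false)
}.compact # stands in for remove_empty_values in this sketch

doc.key?(:full_html) # => false; with enabled: true the serialized page is included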
20 changes: 20 additions & 0 deletions spec/lib/crawler/document_mapper_spec.rb
@@ -131,6 +131,26 @@
expect(result).to eq(expected_result_limited)
end
end

context 'when full HTML extraction is enabled' do
let(:config_params) do
{
domains: [{ url: url.to_s }],
full_html_extraction_enabled: true
}
end
let(:expected_result_extracted) do
expected_result.merge(
full_html: Nokogiri::HTML(content).inner_html
)
end

it 'includes the full HTML in the result' do
result = subject.create_doc(crawl_result)

expect(result).to eq(expected_result_extracted)
end
end
end

context 'when crawl result is a binary file' do
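One detail the expectation above makes explicit: the stored value is Nokogiri::HTML(content).inner_html, i.e. the page as re-serialized by the parser rather than the raw response body, so tag balancing and similar normalization may apply. A quick way to see this:

require 'nokogiri'

raw = '<html><body><p>Unclosed paragraph'
puts Nokogiri::HTML(raw).inner_html
# Prints parser-normalized markup (closed tags, document structure filled in)
# rather than the raw string above.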
