diff --git a/bin/console b/bin/console deleted file mode 100755 index 95070ab..0000000 --- a/bin/console +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env ruby - -# -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License 2.0; -# you may not use this file except in compliance with the Elastic License 2.0. -# - -# Load crawler environment -require_relative '../lib/environment' - -# Start a pry console -require 'pry' -Pry.start diff --git a/bin/crawl b/bin/crawl deleted file mode 100755 index 7a5da96..0000000 --- a/bin/crawl +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env ruby - -# -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License 2.0; -# you may not use this file except in compliance with the Elastic License 2.0. -# - -# Load crawler environment -require_relative '../lib/environment' - -# Standard libraries -require 'getoptlong' -require 'yaml' - -#--------------------------------------------------------------------------------------------------- -def die(message, print_help = false) - puts "ERROR: #{message}" - if print_help - puts - print_usage_help - end - - exit(1) -end - -#--------------------------------------------------------------------------------------------------- -def load_yaml(file_path) - die("Config file #{file_path} does not exist!") unless File.readable?(file_path) - config = - begin - YAML.load_file(file_path) - rescue StandardError => e - die("Failed to load config file #{file_path}: #{e}") - end - config -end - -#--------------------------------------------------------------------------------------------------- -def print_usage_help - puts <<-EOF -Usage: #{$PROGRAM_NAME} [options] - -Where: ---crawl-config Path to crawl config file (required) ---es-config Path to elasticsearch config file (optional) ---debug Enable verbose mode (optional) 
---help Shows this help. - -Useful examples: -# #{$PROGRAM_NAME} --es-config elasticsearch.yml --crawl-config crawler.yml - EOF -end - -#--------------------------------------------------------------------------------------------------- -# Defaults -verbose_logging = false - -# Parse options -opts = GetoptLong.new( - ['--debug', '-v', GetoptLong::NO_ARGUMENT], - ['--help', '-h', GetoptLong::NO_ARGUMENT], - ['--es-config', GetoptLong::REQUIRED_ARGUMENT], - ['--crawl-config', GetoptLong::REQUIRED_ARGUMENT] -) - -es_config = {} -crawl_config = nil - -# Process options -begin - opts.each do |opt, arg| - case opt - when '--debug' - verbose_logging = true - when '--help' - print_usage_help - exit(0) - when '--es-config' - es_config = load_yaml(arg) - when '--crawl-config' - crawl_config = load_yaml(arg) - else - die("#{opt} is not a supported option. Use #{$PROGRAM_NAME} --help to see supported options.") - end - end -rescue GetoptLong::Error => e - puts - die(e, true) -end - -# Require a crawl config -die('Please specify the crawl config file.') if crawl_config == nil - -# Combine configs and apply to crawler -config = es_config.merge(crawl_config) -crawl_config = Crawler::API::Config.new(**config.deep_symbolize_keys) -crawl = Crawler::API::Crawl.new(crawl_config) - -# Perform the crawl! -crawl.start! diff --git a/docs/CONFIG.md b/docs/CONFIG.md index 47aad98..2ea444b 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -3,10 +3,10 @@ Configuration files live in the [config](../config) directory. There are two kinds of configuration files: -1. Crawler configurations (provided in CLI with `--crawler-config`) -2. Elasticsearch configurations (provided in CLI with `--es-config`) +1. Crawler configurations (provided as a positional argument) +2. Elasticsearch configurations (provided as an optional argument with `--es-config`) -There are two configuration files to allow crawl jobs to share Elasticsearch instance configuration. 
+These two configuration file arguments allow crawl jobs to share Elasticsearch instance configuration. There are no enforced pathing or naming for these files. They are differentiated only by how they are provided to the CLI when running a crawl. @@ -16,7 +16,7 @@ Crawler configuration files are required for all crawl jobs. If `elasticsearch` is the output sink, the elasticsearch instance configuration can also be included in a crawler configuration file. If the elasticsearch configuration is provided this way, it will override any configuration provided in an elasticsearch configuration file. -These are provided in the CLI as an argument for the option `--crawl-config`. +These are provided in the CLI as a positional argument, e.g. `bin/crawler crawl path/to/my-crawler.yml`. ## Elasticsearch configuration files @@ -27,7 +27,7 @@ This configuration is also optional. All of the configuration in this file can be provided in a crawler configuration file as well. The crawler config is loaded after the Elasticsearch config, so any Elasticsearch settings in the crawler config will take priority. -These are provided in the CLI as an argument for the option `--es-config`. +These are provided in the CLI as a named argument for the option `--es-config`, e.g. `bin/crawler crawl path/to/my-crawler.yml --es-config=/path/to/elasticsearch.yml`. ## Configuration files in Docker @@ -46,13 +46,13 @@ The order of the opts is not important. When performing a crawl with only a crawl config: ```shell -$ bin/crawl --crawl-config config/my-crawler.yml +$ bin/crawler crawl config/my-crawler.yml ``` When performing a crawl with only both a crawl config and an Elasticsearch config: ```shell -$ bin/crawl --crawl-config config/my-crawler.yml --es-config config/elasticsearch.yml +$ bin/crawler crawl config/my-crawler.yml --es-config config/elasticsearch.yml ``` ## Example configurations