From e310f4b66ab6d287e0386bf815698b153ccf682d Mon Sep 17 00:00:00 2001 From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com> Date: Wed, 29 May 2024 16:50:45 +0200 Subject: [PATCH 1/3] Remove old CLI files --- bin/console | 14 -------- bin/crawl | 101 ---------------------------------------------------- 2 files changed, 115 deletions(-) delete mode 100755 bin/console delete mode 100755 bin/crawl diff --git a/bin/console b/bin/console deleted file mode 100755 index 95070ab..0000000 --- a/bin/console +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env ruby - -# -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License 2.0; -# you may not use this file except in compliance with the Elastic License 2.0. -# - -# Load crawler environment -require_relative '../lib/environment' - -# Start a pry console -require 'pry' -Pry.start diff --git a/bin/crawl b/bin/crawl deleted file mode 100755 index 7a5da96..0000000 --- a/bin/crawl +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env ruby - -# -# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one -# or more contributor license agreements. Licensed under the Elastic License 2.0; -# you may not use this file except in compliance with the Elastic License 2.0. 
-# - -# Load crawler environment -require_relative '../lib/environment' - -# Standard libraries -require 'getoptlong' -require 'yaml' - -#--------------------------------------------------------------------------------------------------- -def die(message, print_help = false) - puts "ERROR: #{message}" - if print_help - puts - print_usage_help - end - - exit(1) -end - -#--------------------------------------------------------------------------------------------------- -def load_yaml(file_path) - die("Config file #{file_path} does not exist!") unless File.readable?(file_path) - config = - begin - YAML.load_file(file_path) - rescue StandardError => e - die("Failed to load config file #{file_path}: #{e}") - end - config -end - -#--------------------------------------------------------------------------------------------------- -def print_usage_help - puts <<-EOF -Usage: #{$PROGRAM_NAME} [options] - -Where: ---crawl-config Path to crawl config file (required) ---es-config Path to elasticsearch config file (optional) ---debug Enable verbose mode (optional) ---help Shows this help. - -Useful examples: -# #{$PROGRAM_NAME} --es-config elasticsearch.yml --crawl-config crawler.yml - EOF -end - -#--------------------------------------------------------------------------------------------------- -# Defaults -verbose_logging = false - -# Parse options -opts = GetoptLong.new( - ['--debug', '-v', GetoptLong::NO_ARGUMENT], - ['--help', '-h', GetoptLong::NO_ARGUMENT], - ['--es-config', GetoptLong::REQUIRED_ARGUMENT], - ['--crawl-config', GetoptLong::REQUIRED_ARGUMENT] -) - -es_config = {} -crawl_config = nil - -# Process options -begin - opts.each do |opt, arg| - case opt - when '--debug' - verbose_logging = true - when '--help' - print_usage_help - exit(0) - when '--es-config' - es_config = load_yaml(arg) - when '--crawl-config' - crawl_config = load_yaml(arg) - else - die("#{opt} is not a supported option. 
Use #{$PROGRAM_NAME} --help to see supported options.") - end - end -rescue GetoptLong::Error => e - puts - die(e, true) -end - -# Require a crawl config -die('Please specify the crawl config file.') if crawl_config == nil - -# Combine configs and apply to crawler -config = es_config.merge(crawl_config) -crawl_config = Crawler::API::Config.new(**config.deep_symbolize_keys) -crawl = Crawler::API::Crawl.new(crawl_config) - -# Perform the crawl! -crawl.start! From 9e9059f2dc4fcdf8a025fd6d891fd594c9fe98ed Mon Sep 17 00:00:00 2001 From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com> Date: Wed, 29 May 2024 16:50:52 +0200 Subject: [PATCH 2/3] Update CONFIG.md --- docs/CONFIG.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/CONFIG.md b/docs/CONFIG.md index b86ce17..d6b62f4 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -3,10 +3,10 @@ Configuration files live in the [config]('../config') directory. There are two kinds of configuration files: -1. Crawler configurations (provided in CLI with `--crawler-config`) -2. Elasticsearch configurations (provided in CLI with `--es-config`) +1. Crawler configurations (provided as a positional argument) +2. Elasticsearch configurations (provided as an optional argument with `--es-config`) -There are two configuration files to allow crawl jobs to share Elasticsearch instance configuration. +The two configuration file arguments allow crawl jobs to share Elasticsearch instance configuration. There are no enforced pathing or naming for these files. They are differentiated only by how they are provided to the CLI when running a crawl. @@ -16,7 +16,7 @@ Crawler configuration files are required for all crawl jobs. If `elasticsearch` is the output sink, the elasticsearch instance configuration can also be included in a crawler configuration file. 
If the elasticsearch configuration is provided this way, it will override any configuration provided in an elasticsearch configuration file. -These are provided in the CLI as an argument for the option `--crawl-config`. +These are provided in the CLI as a positional argument, e.g. `bin/crawler crawl path/to/config.yml`. ## Elasticsearch configuration files @@ -27,7 +27,7 @@ This configuration is also optional. All of the configuration in this file can be provided in a crawler configuration file as well. The crawler config is loaded after the Elasticsearch config, so any Elasticsearch settings in the crawler config will take priority. -These are provided in the CLI as an argument for the option `--es-config`. +These are provided in the CLI as a named argument for the option `--es-config`, e.g. `bin/crawler crawl path/to/config.yml --es-config=/path/to/es-config.yml` ## Configuration files in Docker @@ -46,13 +46,13 @@ The order of the opts is not important. When performing a crawl with only a crawl config: ```shell -$ bin/crawl --crawl-config config/my-crawler.yml +$ bin/crawler crawl config/my-crawler.yml ``` When performing a crawl with only both a crawl config and an Elasticsearch config: ```shell -$ bin/crawl --crawl-config config/my-crawler.yml --es-config config/elasticsearch.yml +$ bin/crawler crawl config/my-crawler.yml --es-config config/elasticsearch.yml ``` ## Example configurations From 57b28cc5854a277c4b0f2ad5031e75b69fc7a26f Mon Sep 17 00:00:00 2001 From: Navarone Feekery <13634519+navarone-feekery@users.noreply.github.com> Date: Wed, 29 May 2024 16:53:23 +0200 Subject: [PATCH 3/3] Update CONFIG.md --- docs/CONFIG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/CONFIG.md b/docs/CONFIG.md index d6b62f4..0633076 100644 --- a/docs/CONFIG.md +++ b/docs/CONFIG.md @@ -16,7 +16,7 @@ Crawler configuration files are required for all crawl jobs. 
If `elasticsearch` is the output sink, the elasticsearch instance configuration can also be included in a crawler configuration file. If the elasticsearch configuration is provided this way, it will override any configuration provided in an elasticsearch configuration file. -These are provided in the CLI as a positional argument, e.g. `bin/crawler crawl path/to/config.yml`. +These are provided in the CLI as a positional argument, e.g. `bin/crawler crawl path/to/my-crawler.yml`. ## Elasticsearch configuration files @@ -27,7 +27,7 @@ This configuration is also optional. All of the configuration in this file can be provided in a crawler configuration file as well. The crawler config is loaded after the Elasticsearch config, so any Elasticsearch settings in the crawler config will take priority. -These are provided in the CLI as a named argument for the option `--es-config`, e.g. `bin/crawler crawl path/to/config.yml --es-config=/path/to/es-config.yml` +These are provided in the CLI as a named argument for the option `--es-config`, e.g. `bin/crawler crawl path/to/my-crawler.yml --es-config=/path/to/elasticsearch.yml`. ## Configuration files in Docker