From ff11d5365ac0217d3f1aeb12ba27c0b335add4f8 Mon Sep 17 00:00:00 2001
From: yujiosaka
Date: Fri, 12 Jan 2018 22:21:13 +0900
Subject: [PATCH] maxDepth is a constructor option

---
 README.md                  | 3 ++-
 examples/priority-queue.js | 1 +
 lib/hccrawler.js           | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 83fc716c..5e4caa0b 100644
--- a/README.md
+++ b/README.md
@@ -146,6 +146,7 @@ HCCrawler.launch({
 * `options` <[Object]>
   * `maxConcurrency` <[number]> Maximum number of pages to open concurrently, defaults to `10`.
   * `maxRequest` <[number]> Maximum number of requests, defaults to `0`. Pass `0` to disable the limit.
+  * `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, defaults to `1`. Leave default to disable following links.
   * `exporter` <[Exporter]> An exporter object which extends [BaseExporter](#class-baseexporter)'s interfaces to export result, default to `null`.
   * `cache` <[Cache]> A cache object which extends [BaseCache](#class-basecache)'s interfaces to remember and skip duplicate requests, defaults to a [SessionCache](#class-sessioncache) object.
   * `persistCache` <[boolean]> Whether to clear cache on closing or disconnecting from the browser, defaults to `false`.
@@ -185,6 +186,7 @@ url, allowedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, d
 * `options` <[Object]>
   * `maxConcurrency` <[number]> Maximum number of pages to open concurrently, defaults to `10`.
   * `maxRequest` <[number]> Maximum number of requests, defaults to `0`. Pass `0` to disable the limit.
+  * `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, defaults to `1`. Leave default to disable following links.
   * `exporter` <[Exporter]> An exporter object which extends [BaseExporter](#class-baseexporter)'s interfaces to export result, default to `null`.
   * `cache` <[Cache]> A cache object which extends [BaseCache](#class-basecache)'s interfaces to remember and skip duplicate requests, defaults to a [SessionCache](#class-sessioncache) object.
   * `persistCache` <[boolean]> Whether to clear cache on closing or disconnecting from the browser, defaults to `false`.
@@ -233,7 +235,6 @@ See [puppeteer.executablePath()](https://github.com/GoogleChrome/puppeteer/blob/
 
 * `options` <[Object]>
   * `url` <[string]> Url to navigate to. The url should include scheme, e.g. `https://`.
-  * `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links.
   * `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
   * `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
   * `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
diff --git a/examples/priority-queue.js b/examples/priority-queue.js
index b4cd2d14..9c8b1910 100644
--- a/examples/priority-queue.js
+++ b/examples/priority-queue.js
@@ -1,6 +1,7 @@
 const HCCrawler = require('headless-chrome-crawler');
 
 HCCrawler.launch({
+  maxDepth: 3,
   maxConcurrency: 1,
   onSuccess: (result => {
     console.log(`Requested ${result.options.url}.`);
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index e55ef932..24079603 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -47,6 +47,7 @@ const LAUNCH_OPTIONS = [
 const CONSTRUCTOR_OPTIONS = CONNECT_OPTIONS.concat(LAUNCH_OPTIONS).concat([
   'maxConcurrency',
   'maxRequest',
+  'maxDepth',
   'cache',
   'exporter',
   'persistCache',
@@ -458,7 +459,7 @@ class HCCrawler extends EventEmitter {
    * @private
    */
   _followLinks(links, options, depth) {
-    if (depth >= options.maxDepth) {
+    if (depth >= this._options.maxDepth) {
       this.emit(HCCrawler.Events.MaxDepthReached);
       return;
     }
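
Note: after this change, `maxDepth` is read from the constructor options passed to HCCrawler.launch()/HCCrawler.connect() rather than from per-queue options. A minimal usage sketch under that assumption (the URL and the onSuccess handler below are illustrative, not part of the patch):

const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  maxDepth: 2, // follow links up to depth 2; the default of 1 disables link following
  onSuccess: (result => {
    console.log(`Requested ${result.options.url}.`);
  }),
})
  .then(crawler => {
    crawler.queue('https://example.com/'); // illustrative URL
    crawler.onIdle()
      .then(() => crawler.close());
  });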