From a81d29f2d801338f17fdf3dd5120222a5efc9e35 Mon Sep 17 00:00:00 2001 From: yujiosaka Date: Wed, 21 Feb 2018 19:10:02 +0900 Subject: [PATCH 1/2] feat: support depth first ordering --- CHANGELOG.md | 2 +- README.md | 7 ++++--- lib/hccrawler.js | 42 ++++++++++++++++++++---------------------- test/hccrawler.test.js | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11b7a2dd..3de6cea5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added - Emit `newpage` event. -- Support `deniedDomains` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options. +- Support `deniedDomains` and `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options. ### changed diff --git a/README.md b/README.md index 590a800b..492e3e3e 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Powered by Headless Chrome, the crawler provides [simple APIs](#api-reference) t * Distributed crawling * Configure concurrency, delay and retry -* Breadth-first search (BFS) to automatically follow links +* Support both [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search) and [breadth-first search](https://en.wikipedia.org/wiki/Breadth-first_search) algorithm * Pluggable cache storages such as [Redis](https://redis.io) * Support [CSV](https://tools.ietf.org/html/rfc4180) and [JSON Lines](http://jsonlines.org) for exporting results * Pause at the max request and resume at any time @@ -180,7 +180,7 @@ browserWSEndpoint, ignoreHTTPSErrors Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) are executed. ``` -url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage +url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage ``` > **Note**: In practice, setting the options every time you queue equests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity. @@ -220,7 +220,7 @@ ignoreHTTPSErrors, headless, executablePath, slowMo, args, ignoreDefaultArgs, ha Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) are executed. ``` -url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage +url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage ``` > **Note**: In practice, setting the options every time you queue the requests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity. @@ -239,6 +239,7 @@ url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryD * `url` <[string]> Url to navigate to. The url should include scheme, e.g. `https://`. * `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links. * `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred. 
+ * `depthPriority` <[boolean]> Whether to adjust priority based on its depth, defaults to `true`. Leave default to increase priority for higher depth, which is [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search). * `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same. * `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`. * `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, default to `false`. diff --git a/lib/hccrawler.js b/lib/hccrawler.js index 857e6dd3..8b79910c 100644 --- a/lib/hccrawler.js +++ b/lib/hccrawler.js @@ -113,6 +113,7 @@ class HCCrawler extends EventEmitter { jQuery: true, persistCache: false, skipDuplicates: true, + depthPriority: true, obeyRobotsTxt: true, followSitemapXml: false, screenshot: null, @@ -128,7 +129,7 @@ class HCCrawler extends EventEmitter { this._onSuccess = options.onSuccess || null; this._onError = options.onError || null; this._exportHeader(); - this._queue.on('pull', (...args) => this._onPull(...args)); + this._queue.on('pull', (...args) => this._startRequest(...args)); this._browser.on('disconnected', () => { this.emit(HCCrawler.Events.Disconnected); }); @@ -158,7 +159,7 @@ class HCCrawler extends EventEmitter { if (!mergedOptions.url) throw new Error('Url must be defined!'); if (mergedOptions.device && !includes(deviceNames, mergedOptions.device)) throw new Error('Specified device is not supported!'); if (mergedOptions.delay > 0 && mergedOptions.maxConcurrency !== 1) throw new Error('Max concurrency must be 1 when delay is set!'); - this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS)); + this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS), 1); }); } @@ -266,10 +267,12 @@ class HCCrawler extends EventEmitter { /** * @param {!Object} options - * @param {!number=} depth + * @param {!number} depth */ - _push(options, depth = 1) { - this._queue.push(options, depth, options.priority); + _push(options, depth) { + let { priority } = options; + if (!priority && options.depthPriority) priority = depth; + this._queue.push(options, depth, priority); } /** @@ -278,7 +281,7 @@ class HCCrawler extends EventEmitter { * @return {!Promise} * @private */ - _onPull(options, depth) { + _startRequest(options, depth) { return this._skipRequest(options) .then(skip => { if (skip) { @@ -286,7 +289,11 @@ class HCCrawler extends EventEmitter { return Promise.resolve(); } return this._followSitemap(options, depth) - .then(() => this._request(options, depth)); + .then(() => this._request(options, depth)) + .then(links => { + this._checkRequestCount(); + return delay(options.delay).then(() => this._followLinks(links, options, depth)); + }); }); } @@ -303,9 +310,7 @@ class HCCrawler extends EventEmitter { this._shouldRequest(options), ]) .then(([requested, allowedRobot, allowedDomain, shouldRequest]) => { - if (requested || !allowedRobot || !allowedDomain || !shouldRequest) { - return true; - } + if (requested || !allowedRobot || !allowedDomain || !shouldRequest) return true; return false; }); } @@ -314,7 +319,7 @@ class HCCrawler extends EventEmitter { * @param {!Object} options * @param {!number} depth * @param {!number=} retryCount - * @return {!Promise} + * @return {!Promise} * @private */ _request(options, depth, retryCount = 0) { @@ -324,18 +329,12 @@ class HCCrawler extends 
EventEmitter { this.emit(HCCrawler.Events.NewPage, crawler.page()); return crawler.crawl() .then(res => { - res = extend({}, res); - res.options = options; - res.depth = depth; + res = extend({ options, depth }, res); this.emit(HCCrawler.Events.RequestFinished, res); return this._success(res) - .then(() => { - this._exportLine(res); - this._checkRequestCount(); - this._followLinks(res.links, options, depth); - }) + .then(() => { void this._exportLine(res); }) .then(() => crawler.close()) - .then(() => delay(options.delay)); + .then(() => res.links); }) .catch(error => { if (retryCount >= options.retryCount) throw error; @@ -348,9 +347,8 @@ class HCCrawler extends EventEmitter { .catch(error => { this.emit(HCCrawler.Events.RequestFailed, error); return this._error(error) - .then(() => void this._checkRequestCount()) .then(() => crawler.close()) - .then(() => delay(options.delay)); + .then(() => []); }); }); } diff --git a/test/hccrawler.test.js b/test/hccrawler.test.js index ad6828a4..932bfae4 100644 --- a/test/hccrawler.test.js +++ b/test/hccrawler.test.js @@ -512,6 +512,38 @@ describe('HCCrawler', () => { assert.equal(onSuccess.callCount, 3); }); }); + + context('when the first page contains several links', () => { + beforeEach(() => { + server.setContent('/1.html', ` + go to /2.html + go to /3.html + `); + server.setContent('/2.html', `go to /4.html`); + }); + + it('follow links with depth first order with maxDepth = 3', () => { + crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3 }); + return crawler.onIdle() + .then(() => { + assert.equal(onSuccess.callCount, 4); + assert.equal(onSuccess.firstCall.args[0].depth, 1); + assert.equal(onSuccess.secondCall.args[0].depth, 2); + assert.equal(onSuccess.thirdCall.args[0].depth, 3); + }); + }); + + it('follow links with breadth first order with maxDepth = 3 and depthPriority = false', () => { + crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3, depthPriority: false }); + return crawler.onIdle() + .then(() => { + assert.equal(onSuccess.callCount, 4); + assert.equal(onSuccess.firstCall.args[0].depth, 1); + assert.equal(onSuccess.secondCall.args[0].depth, 2); + assert.equal(onSuccess.thirdCall.args[0].depth, 2); + }); + }); + }); }); context('when the crawler is launched with maxRequest option', () => { From 1cdbb9955e0d5c44011f4f7ee74cfc904c5a5176 Mon Sep 17 00:00:00 2001 From: yujiosaka Date: Wed, 21 Feb 2018 19:19:36 +0900 Subject: [PATCH 2/2] test(helper): add tests for Helper.checkDomainMatch() --- test/helper.test.js | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/helper.test.js b/test/helper.test.js index 65cfdbd6..cb7e259d 100644 --- a/test/helper.test.js +++ b/test/helper.test.js @@ -9,6 +9,7 @@ const { escapeQuotes, getRobotsUrl, lowerBound, + checkDomainMatch, getSitemapUrls, unescape, stringifyArgument, @@ -217,6 +218,38 @@ describe('Helper', () => { }); }); + describe('Helper.checkDomainMatch', () => { + it('returns false for empty array', () => { + const actual = checkDomainMatch([], '127.0.0.1'); + const expected = false; + assert.equal(actual, expected); + }); + + it('returns false when no domain fully matches requested hostname', () => { + const actual = checkDomainMatch(['localhost', '0.0.0.0'], '127.0.0.1'); + const expected = false; + assert.equal(actual, expected); + }); + + it('returns false when no domain matches requested hostname by regular expression', () => { + const actual = checkDomainMatch([/^localhost$/, /^\d\.\d\.\d\.\d$/], '127.0.0.1'); + const expected = false; + 
assert.equal(actual, expected); + }); + + it('returns true when a domain fully matches requested hostname', () => { + const actual = checkDomainMatch(['localhost', '127.0.0.1'], '127.0.0.1'); + const expected = true; + assert.equal(actual, expected); + }); + + it('returns true when a domain fully matches requested hostname by regular expression', () => { + const actual = checkDomainMatch([/^localhost$/, /^\d+\.\d+\.\d+\.\d+$/], '127.0.0.1'); + const expected = true; + assert.equal(actual, expected); + }); + }); + describe('Helper.getSitemapUrls', () => { it('returns empty array for empty xml', () => { const actual = getSitemapUrls('');
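
---

Not part of the patch — a minimal usage sketch of the new `depthPriority` option, assuming the `HCCrawler.launch()` / `crawler.queue()` API documented in the project README; the seed URLs are placeholders. As in the tests above, `result.depth` and `result.options` are attached to each result by `_request()`.

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  // Called once per crawled page; depth is 1 for the seed URL and grows as links are followed.
  onSuccess: result => {
    console.log(`depth ${result.depth}: ${result.options.url}`);
  },
})
  .then(crawler => {
    // depthPriority defaults to true: deeper requests get a higher priority,
    // so links are followed in depth-first order.
    crawler.queue({ url: 'https://example.com/', maxDepth: 3 });

    // Setting depthPriority: false keeps the flat default priority,
    // which yields the previous breadth-first order.
    crawler.queue({ url: 'https://example.net/', maxDepth: 3, depthPriority: false });

    return crawler.onIdle().then(() => crawler.close());
  });
```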