diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11b7a2dd..3de6cea5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Added
- Emit `newpage` event.
-- Support `deniedDomains` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
+- Support `deniedDomains` and `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
### Changed
diff --git a/README.md b/README.md
index 590a800b..492e3e3e 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ Powered by Headless Chrome, the crawler provides [simple APIs](#api-reference) t
* Distributed crawling
* Configure concurrency, delay and retry
-* Breadth-first search (BFS) to automatically follow links
+* Support both [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search) and [breadth-first search](https://en.wikipedia.org/wiki/Breadth-first_search) algorithms
* Pluggable cache storages such as [Redis](https://redis.io)
* Support [CSV](https://tools.ietf.org/html/rfc4180) and [JSON Lines](http://jsonlines.org) for exporting results
* Pause at the max request and resume at any time
@@ -180,7 +180,7 @@ browserWSEndpoint, ignoreHTTPSErrors
Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) is executed.
```
-url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
```
> **Note**: In practice, setting the options every time you queue requests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity.
@@ -220,7 +220,7 @@ ignoreHTTPSErrors, headless, executablePath, slowMo, args, ignoreDefaultArgs, ha
Also, the following options can be set as default values when [crawler.queue()](#crawlerqueueoptions) is executed.
```
-url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
+url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, retryCount, retryDelay, jQuery, device, username, password, evaluatePage
```
> **Note**: In practice, setting the options every time you queue the requests is redundant. Therefore, it's recommended to set the default values and override them depending on the necessity.
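To make the note concrete, here is a minimal sketch (the URLs are placeholders): `depthPriority` is in the list above, so it can be given a default at launch and then overridden for an individual `crawler.queue()` call.

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  depthPriority: false, // default for every queue() call: crawl breadth-first
  onSuccess: result => console.log(result.depth, result.options.url),
})
  .then(crawler => {
    crawler.queue({ url: 'https://example.com/', maxDepth: 2 });                           // uses the default
    crawler.queue({ url: 'https://example.com/docs/', maxDepth: 3, depthPriority: true }); // override: depth-first
    return crawler.onIdle().then(() => crawler.close());
  });
```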
@@ -239,6 +239,7 @@ url, allowedDomains, deniedDomains, timeout, priority, delay, retryCount, retryD
* `url` <[string]> Url to navigate to. The url should include scheme, e.g. `https://`.
* `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links.
* `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
+ * `depthPriority` <[boolean]> Whether to adjust the priority based on the request's depth, defaults to `true`. Leave the default to give deeper requests a higher priority, which results in [depth-first search](https://en.wikipedia.org/wiki/Depth-first_search); see the sketch after this list.
* `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
* `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
* `followSitemapXml` <[boolean]> Whether to use [sitemap.xml](https://www.sitemaps.org/) to find locations, default to `false`.
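As referenced in the `depthPriority` description above, here is a short sketch of what the flag means in practice (placeholder pages on a local server; with concurrency above 1 the order can interleave). Assume `/a.html` links to `/b.html` and `/c.html`, and `/b.html` links to `/d.html`.

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  maxConcurrency: 1, // serialize requests so the ordering effect is visible
  onSuccess: result => console.log(`depth ${result.depth}: ${result.options.url}`),
})
  .then(crawler => {
    crawler.queue({ url: 'http://127.0.0.1:8080/a.html', maxDepth: 3 });
    // depthPriority: true (default) -> /a.html, /b.html, /d.html, /c.html (depth-first)
    // depthPriority: false          -> /a.html, /b.html, /c.html, /d.html (breadth-first)
    return crawler.onIdle().then(() => crawler.close());
  });
```

The new tests in `test/hccrawler.test.js` further below exercise exactly this ordering through the `depth` reported to `onSuccess`.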
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index 857e6dd3..8b79910c 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -113,6 +113,7 @@ class HCCrawler extends EventEmitter {
jQuery: true,
persistCache: false,
skipDuplicates: true,
+ depthPriority: true,
obeyRobotsTxt: true,
followSitemapXml: false,
screenshot: null,
@@ -128,7 +129,7 @@ class HCCrawler extends EventEmitter {
this._onSuccess = options.onSuccess || null;
this._onError = options.onError || null;
this._exportHeader();
- this._queue.on('pull', (...args) => this._onPull(...args));
+ this._queue.on('pull', (...args) => this._startRequest(...args));
this._browser.on('disconnected', () => {
this.emit(HCCrawler.Events.Disconnected);
});
@@ -158,7 +159,7 @@ class HCCrawler extends EventEmitter {
if (!mergedOptions.url) throw new Error('Url must be defined!');
if (mergedOptions.device && !includes(deviceNames, mergedOptions.device)) throw new Error('Specified device is not supported!');
if (mergedOptions.delay > 0 && mergedOptions.maxConcurrency !== 1) throw new Error('Max concurrency must be 1 when delay is set!');
- this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS));
+ this._push(omit(mergedOptions, CONSTRUCTOR_OPTIONS), 1);
});
}
@@ -266,10 +267,12 @@ class HCCrawler extends EventEmitter {
/**
* @param {!Object} options
- * @param {!number=} depth
+ * @param {!number} depth
*/
- _push(options, depth = 1) {
- this._queue.push(options, depth, options.priority);
+ _push(options, depth) {
+ let { priority } = options;
+ if (!priority && options.depthPriority) priority = depth;
+ this._queue.push(options, depth, priority);
}
/**
@@ -278,7 +281,7 @@ class HCCrawler extends EventEmitter {
* @return {!Promise}
* @private
*/
- _onPull(options, depth) {
+ _startRequest(options, depth) {
return this._skipRequest(options)
.then(skip => {
if (skip) {
@@ -286,7 +289,11 @@ class HCCrawler extends EventEmitter {
return Promise.resolve();
}
return this._followSitemap(options, depth)
- .then(() => this._request(options, depth));
+ .then(() => this._request(options, depth))
+ .then(links => {
+ this._checkRequestCount();
+ return delay(options.delay).then(() => this._followLinks(links, options, depth));
+ });
});
}
@@ -303,9 +310,7 @@ class HCCrawler extends EventEmitter {
this._shouldRequest(options),
])
.then(([requested, allowedRobot, allowedDomain, shouldRequest]) => {
- if (requested || !allowedRobot || !allowedDomain || !shouldRequest) {
- return true;
- }
+ if (requested || !allowedRobot || !allowedDomain || !shouldRequest) return true;
return false;
});
}
@@ -314,7 +319,7 @@ class HCCrawler extends EventEmitter {
* @param {!Object} options
* @param {!number} depth
* @param {!number=} retryCount
- * @return {!Promise}
+ * @return {!Promise}
* @private
*/
_request(options, depth, retryCount = 0) {
@@ -324,18 +329,12 @@ class HCCrawler extends EventEmitter {
this.emit(HCCrawler.Events.NewPage, crawler.page());
return crawler.crawl()
.then(res => {
- res = extend({}, res);
- res.options = options;
- res.depth = depth;
+ res = extend({ options, depth }, res);
this.emit(HCCrawler.Events.RequestFinished, res);
return this._success(res)
- .then(() => {
- this._exportLine(res);
- this._checkRequestCount();
- this._followLinks(res.links, options, depth);
- })
+ .then(() => { void this._exportLine(res); })
.then(() => crawler.close())
- .then(() => delay(options.delay));
+ .then(() => res.links);
})
.catch(error => {
if (retryCount >= options.retryCount) throw error;
@@ -348,9 +347,8 @@ class HCCrawler extends EventEmitter {
.catch(error => {
this.emit(HCCrawler.Events.RequestFailed, error);
return this._error(error)
- .then(() => void this._checkRequestCount())
.then(() => crawler.close())
- .then(() => delay(options.delay));
+ .then(() => []);
});
});
}
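To make the new `_push()` logic concrete, here is a toy, self-contained model of the priority behaviour (it is not the crawler's actual `PriorityQueue`, and the real pull order also depends on `maxConcurrency` and the cache backend): when no explicit `priority` is given and `depthPriority` is on, the depth itself becomes the priority, and since the queue prefers the larger number, newly discovered deeper links jump ahead of their waiting siblings.

```js
const queue = [];
let seq = 0;

// Mirrors the fallback in _push(): priority defaults to the depth when depthPriority is on.
function push(url, depth, depthPriority) {
  const priority = depthPriority ? depth : 1;
  queue.push({ url, depth, priority, seq: seq++ });
  queue.sort((a, b) => b.priority - a.priority || a.seq - b.seq); // larger priority first, FIFO on ties
}

// Simulates following links from a tiny site: /1.html -> /2.html, /3.html; /2.html -> /4.html.
function crawl(depthPriority) {
  const order = [];
  push('/1.html', 1, depthPriority);
  while (queue.length) {
    const { url, depth } = queue.shift();
    order.push(url);
    if (url === '/1.html') ['/2.html', '/3.html'].forEach(u => push(u, depth + 1, depthPriority));
    if (url === '/2.html') push('/4.html', depth + 1, depthPriority);
  }
  return order;
}

console.log(crawl(true));  // [ '/1.html', '/2.html', '/4.html', '/3.html' ]: depth-first
console.log(crawl(false)); // [ '/1.html', '/2.html', '/3.html', '/4.html' ]: breadth-first
```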
diff --git a/test/hccrawler.test.js b/test/hccrawler.test.js
index ad6828a4..932bfae4 100644
--- a/test/hccrawler.test.js
+++ b/test/hccrawler.test.js
@@ -512,6 +512,38 @@ describe('HCCrawler', () => {
assert.equal(onSuccess.callCount, 3);
});
});
+
+ context('when the first page contains several links', () => {
+ beforeEach(() => {
+ server.setContent('/1.html', `
+ go to <a href="/2.html">/2.html</a>
+ go to <a href="/3.html">/3.html</a>
+ `);
+ server.setContent('/2.html', `go to <a href="/4.html">/4.html</a>`);
+ });
+
+ it('follows links in depth-first order when maxDepth = 3', () => {
+ crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3 });
+ return crawler.onIdle()
+ .then(() => {
+ assert.equal(onSuccess.callCount, 4);
+ assert.equal(onSuccess.firstCall.args[0].depth, 1);
+ assert.equal(onSuccess.secondCall.args[0].depth, 2);
+ assert.equal(onSuccess.thirdCall.args[0].depth, 3);
+ });
+ });
+
+ it('follows links in breadth-first order when maxDepth = 3 and depthPriority = false', () => {
+ crawler.queue({ url: `${PREFIX}/1.html`, maxDepth: 3, depthPriority: false });
+ return crawler.onIdle()
+ .then(() => {
+ assert.equal(onSuccess.callCount, 4);
+ assert.equal(onSuccess.firstCall.args[0].depth, 1);
+ assert.equal(onSuccess.secondCall.args[0].depth, 2);
+ assert.equal(onSuccess.thirdCall.args[0].depth, 2);
+ });
+ });
+ });
});
context('when the crawler is launched with maxRequest option', () => {
diff --git a/test/helper.test.js b/test/helper.test.js
index 65cfdbd6..cb7e259d 100644
--- a/test/helper.test.js
+++ b/test/helper.test.js
@@ -9,6 +9,7 @@ const {
escapeQuotes,
getRobotsUrl,
lowerBound,
+ checkDomainMatch,
getSitemapUrls,
unescape,
stringifyArgument,
@@ -217,6 +218,38 @@ describe('Helper', () => {
});
});
+ describe('Helper.checkDomainMatch', () => {
+ it('returns false for empty array', () => {
+ const actual = checkDomainMatch([], '127.0.0.1');
+ const expected = false;
+ assert.equal(actual, expected);
+ });
+
+ it('returns false when no domain fully matches requested hostname', () => {
+ const actual = checkDomainMatch(['localhost', '0.0.0.0'], '127.0.0.1');
+ const expected = false;
+ assert.equal(actual, expected);
+ });
+
+ it('returns false when no domain matches requested hostname by regular expression', () => {
+ const actual = checkDomainMatch([/^localhost$/, /^\d\.\d\.\d\.\d$/], '127.0.0.1');
+ const expected = false;
+ assert.equal(actual, expected);
+ });
+
+ it('returns true when a domain fully matches requested hostname', () => {
+ const actual = checkDomainMatch(['localhost', '127.0.0.1'], '127.0.0.1');
+ const expected = true;
+ assert.equal(actual, expected);
+ });
+
+ it('returns true when a domain fully matches requested hostname by regular expression', () => {
+ const actual = checkDomainMatch([/^localhost$/, /^\d+\.\d+\.\d+\.\d+$/], '127.0.0.1');
+ const expected = true;
+ assert.equal(actual, expected);
+ });
+ });
+
describe('Helper.getSitemapUrls', () => {
it('returns empty array for empty xml', () => {
const actual = getSitemapUrls('');
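For reference, the tests above pin down the expected contract of `Helper.checkDomainMatch`. A sketch of an implementation consistent with them (the real one lives in `lib/helper.js` and may differ) treats each entry as either an exact hostname string or a `RegExp`:

```js
// Hypothetical shape matching the test cases above: string entries must equal
// the hostname exactly, RegExp entries are tested against it.
function checkDomainMatch(domains, hostname) {
  return (domains || []).some(domain => (
    domain instanceof RegExp ? domain.test(hostname) : domain === hostname
  ));
}

checkDomainMatch(['localhost', '127.0.0.1'], '127.0.0.1'); // true
checkDomainMatch([/^\d\.\d\.\d\.\d$/], '127.0.0.1');       // false: `\d` matches a single digit only
```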