From 4f4a6d4c50738a803511e4c4a6eb88891ce8e8bb Mon Sep 17 00:00:00 2001 From: yujiosaka Date: Sun, 10 Dec 2017 17:16:55 +0900 Subject: [PATCH] Improve docs and examples --- README.md | 110 +++++++++++++++--- examples/.eslintrc.js | 1 + examples/custom-skip.js | 25 ++++ examples/delay.js | 1 - examples/disable-jquery.js | 4 +- examples/emulate-device.js | 3 +- examples/pause-resume.js | 1 + examples/{session-cache.js => redis-cache.js} | 5 +- examples/skip-request.js | 23 ---- lib/cache/base.js | 10 ++ lib/cache/redis.js | 15 +++ lib/cache/session.js | 8 ++ lib/hccrawler.js | 61 ++++++---- test/hccrawler.test.js | 24 ++-- 14 files changed, 214 insertions(+), 77 deletions(-) create mode 100644 examples/custom-skip.js rename examples/{session-cache.js => redis-cache.js} (77%) delete mode 100644 examples/skip-request.js diff --git a/README.md b/README.md index edf64424..9e33cee8 100644 --- a/README.md +++ b/README.md @@ -5,11 +5,12 @@ Headless Chrome crawler with [jQuery](https://jquery.com) powered by [Puppeteer] Crawlers based on simple requests to html files are generally fast. However, it sometimes end up capturing empty bodies, especially when the websites are built on such modern frontend frameworks as AngularJS, ReactJS and Vue.js. -Powered by [Puppeteer](https://github.com/GoogleChrome/puppeteer), headless-chrome-crawler allows you to scrape those single page applications with the following features: +Powered by [Puppeteer](https://github.com/GoogleChrome/puppeteer), headless-chrome-crawler allows you to crawl those single page applications with the following features: * Configure concurrency, delay and retries * Pluggable cache to skip duplicate requests -* Cancel requests by conditions +* Cancel requests by custom conditions +* Restrict requests by domains * Pause and resume at any time * Insert [jQuery](https://jquery.com) automatically * Priority queue @@ -32,6 +33,8 @@ yarn add headless-chrome-crawler The basic API of headless-chrome-crawler is inspired by that of [node-crawler](https://github.com/bda-research/node-crawler), so the API design is somewhat similar but not exactly compatible. +**Example** - Queueing requests in different styles + ```js const HCCrawler = require('headless-chrome-crawler'); @@ -51,14 +54,14 @@ HCCrawler.launch({ crawler.queue('https://example.com/'); // Queue multiple requests crawler.queue(['https://example.net/', 'https://example.org/']); - // Queue a query custom options + // Queue a request with custom options crawler.queue({ jQuery: false, url: 'https://example.com/', + device: 'Nexus 6', evaluatePage: (() => ({ title: document.title, - h1: document.getElementsByTagName('h1')[0].innerText, - p: document.getElementsByTagName('p')[0].innerText + userAgent: window.navigator.userAgent, })), }); // Called when no queue is left @@ -67,6 +70,47 @@ HCCrawler.launch({ }); ``` +**Example** - Pause and resume with cache storage for large scale crawling + +```js +const HCCrawler = require('headless-chrome-crawler'); + +// Passing no options expects Redis to be run in the local machine with default port. 
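+// The same cache instance is shared by both launch() calls below. Because
+// ensureClearCache is set to false, the cache is not cleared when the crawler
+// is closed, so the second run can skip urls already requested in the first run.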
+const cache = new HCCrawler.RedisCache();
+
+function launch() {
+  return HCCrawler.launch({
+    maxConcurrency: 1,
+    maxRequest: 2,
+    evaluatePage: (() => ({
+      title: $('title').text(),
+      h1: $('h1').text(),
+    })),
+    onSuccess: (result => {
+      console.log('onSuccess', result);
+    }),
+    ensureClearCache: false, // Set false so that cache won't be cleared when closing the crawler
+    cache,
+  });
+}
+
+launch()
+  .then(crawler => {
+    crawler.queue({ url: 'https://example.com/' });
+    crawler.queue({ url: 'https://example.net/' });
+    crawler.queue({ url: 'https://example.org/' }); // The queue won't be requested due to maxRequest option
+    return crawler.onIdle()
+      .then(() => crawler.close()); // Close the crawler but cache won't be cleared
+  })
+  .then(() => launch()) // Launch the crawler again
+  .then(crawler => {
+    crawler.queue({ url: 'https://example.net/' }); // This queue won't be requested because cache remains
+    crawler.queue({ url: 'https://example.org/' });
+    return crawler.onIdle()
+      .then(() => crawler.close());
+  });
+```
+
 ## Examples
 
 See [here](https://github.com/yujiosaka/headless-chrome-crawler/tree/master/examples).
@@ -78,7 +122,11 @@ See [here](https://github.com/yujiosaka/headless-chrome-crawler/tree/master/exam
 * [class: HCCrawler](#class-hccrawler)
   * [HCCrawler.connect([options])](#hccrawlerconnectoptions)
   * [HCCrawler.launch([options])](#hccrawlerlaunchoptions)
+  * [HCCrawler.executablePath()](#hccrawlerexecutablepath)
   * [crawler.queue([options])](#crawlerqueueoptions)
+  * [crawler.setMaxRequest(maxRequest)](#crawlersetmaxrequestmaxrequest)
+  * [crawler.pause()](#crawlerpause)
+  * [crawler.resume()](#crawlerresume)
   * [crawler.close()](#crawlerclose)
   * [crawler.disconnect()](#crawlerdisconnect)
   * [crawler.version()](#crawlerversion)
@@ -90,7 +138,7 @@ See [here](https://github.com/yujiosaka/headless-chrome-crawler/tree/master/exam
 
 ### class: HCCrawler
 
-HCCrawler provides method to launch or connect to a HeadlessChrome/Chromium. It also provides a `HCCrawler.executablePath()` method which behaves the same as [puppeteer.executablePath()](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerexecutablepath).
+HCCrawler provides methods to launch or connect to a HeadlessChrome/Chromium.
 
 #### HCCrawler.connect([options])
 
@@ -111,7 +159,7 @@ Also, the following options can be set as default values when [crawler.queue([op
 url, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username, password, shouldRequest, evaluatePage, onSuccess, onError
 ```
 
-> **Note**: In practice, setting the options every time you queue the requests is not only redundant but also slow. Therefore, it's recommended to set the default values and override them depending on the necessity.
+> **Note**: In practice, setting the options every time you queue requests is not only redundant but also slow. Therefore, it's recommended to set the default values and override them depending on the necessity.
 
 #### HCCrawler.launch([options])
 
@@ -134,21 +182,29 @@ url, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username,
 
 > **Note**: In practice, setting the options every time you queue the requests is not only redundant but also slow. Therefore, it's recommended to set the default values and override them depending on the necessity.
 
+#### HCCrawler.executablePath()
+
+* returns: <[String]> An expected path to find the bundled Chromium.
+
+See [puppeteer.executablePath()](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerexecutablepath) for more details.
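+
+**Example** - Looking up the bundled Chromium path (a minimal sketch; only the method documented above is used)
+
+```js
+const HCCrawler = require('headless-chrome-crawler');
+
+// Prints the path where the bundled Chromium is expected to be found
+console.log(HCCrawler.executablePath());
+```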
+
 #### crawler.queue([options])
 
 * `options` <[Object]>
   * `url` <[String]> Url to navigate to. The url should include scheme, e.g. `https://`.
-  * `priority` <[number]> Basic priority of queues, defaults to `1`. Queues with larger priorities are preferred.
-  * `allowedDomains` <[Array]> List of domains that the crawler is allowed to request. `www.example.com` will be allowed if 'example.com' is added.
+  * `priority` <[number]> Basic priority of queues, defaults to `1`. Queues with larger priority numbers are requested earlier.
+  * `allowedDomains` <[Array]> List of domains the crawler is allowed to request. `www.example.com` will be allowed if `example.com` is listed.
   * `delay` <[number]> Number of milliseconds after each request, defaults to `0`. When delay is set, maxConcurrency must be `1`.
   * `retryCount` <[number]> Number of limit when retry fails, defaults to `3`.
   * `retryDelay` <[number]> Number of milliseconds after each retry fails, defaults to `10000`.
   * `jQuery` <[boolean]> Whether to automatically add [jQuery](https://jquery.com) tag to page, defaults to `true`.
   * `device` <[String]> Device to emulate. Available devices are listed [here](https://github.com/GoogleChrome/puppeteer/blob/master/DeviceDescriptors.js).
-  * `username` <[String]> Username required for Basic Authentication. pass `null` if it's not necessary.
-  * `password` <[String]> Password required for Basic Authentication. pass `null` if it's not necessary.
+  * `username` <[String]> Username for Basic Authentication. Pass `null` if it's not necessary.
+  * `password` <[String]> Password for Basic Authentication. Pass `null` if it's not necessary.
   * `userAgent` <[String]> User agent string to use in this page.
   * `extraHeaders` <[Object]> An object containing additional http headers to be sent with every request. All header values must be strings.
+  * `cache` <[Cache]> A cache object which extends BaseCache to remember and skip duplicate requests, defaults to `SessionCache`. Pass `null` if you don't want to skip duplicate requests.
+  * `ensureClearCache` <[boolean]> Whether to clear cache on closing or disconnecting from the browser, defaults to `true`.
   * `preRequest(options)` <[Function]> Function to do anything like waiting and modifying options before each request. You can also return `false` if you want to skip the request.
     * `options` <[Object]> [crawler.queue([options])](#crawlerqueueoptions)'s options with default values.
   * `evaluatePage()` <[Function]> Function to be evaluated in browsers. Return serializable object. If it's not serializable, the result will be `undefined`.
@@ -159,15 +215,33 @@ url, timeout, priority, delay, retryCount, retryDelay, jQuery, device, username,
     * `status` <[String]> status code of the request.
     * `url` <[String]> Last requested url.
     * `headers` <[Object]> Response headers.
-    * `options` <[Object]> crawler.queue([options])](#crawlerqueueoptions)'s options with default values.
-    * `result` <[Serializable]> The result resolved from `evaluatePage()`.
+    * `options` <[Object]> [crawler.queue([options])](#crawlerqueueoptions)'s options with default values.
+    * `result` <[Serializable]> The result resolved from the `evaluatePage()` option.
   * `onError(error)` <[Function]> Function to be called when request fails.
     * `error` <[Error]> Error object.
 
 > **Note**: `response.url` may be different from `options.url` especially when the requested url is redirected.
 
+The following options are passed straight to the options of [Puppeteer's page.goto(url, options)](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#pagegotourl-options).
+
+```
+timeout, waitUntil
+```
+
 The options can be either an object, an array, or a string. When it's an array, each item in the array will be executed. When it's a string, the options are transformed to an object with only url defined.
 
+#### crawler.setMaxRequest(maxRequest)
+
+This method allows you to modify the `maxRequest` option you passed to [HCCrawler.connect([options])](#hccrawlerconnectoptions) or [HCCrawler.launch([options])](#hccrawlerlaunchoptions).
+
+#### crawler.pause()
+
+This method allows you to pause processing queues. You can resume the queue by calling [crawler.resume()](#crawlerresume).
+
+#### crawler.resume()
+
+This method allows you to resume processing queues. It may be used after the queue is intentionally paused by calling [crawler.pause()](#crawlerpause) or paused automatically because the request count reached the `maxRequest` option.
+
 #### crawler.close()
 
 returns: <[Promise]> Promise which is resolved when ther browser is closed.
@@ -194,7 +268,11 @@ See [Puppeteer's browser.wsEndpoint()](https://github.com/GoogleChrome/puppeteer
 
 #### crawler.onIdle()
 
-- returns: <[Promise]> Promise which is resolved when queues become empty.
+- returns: <[Promise]> Promise which is resolved when queues become empty or paused.
+
+#### crawler.isPaused
+
+* returns: <[boolean]> Whether the queue is paused. This property is read only.
 
 #### crawler.queueSize
 
@@ -208,6 +286,10 @@ See [Puppeteer's browser.wsEndpoint()](https://github.com/GoogleChrome/puppeteer
 
 * returns: <[number]> The count of total requests. This property is read only.
 
+#### crawler.cache
+
+* returns: <[Cache]> The cache set to skip duplicate requests. This property is read only.
+
 ## Debugging tips
 
 ### Launch options
diff --git a/examples/.eslintrc.js b/examples/.eslintrc.js
index 15d68fad..32902a18 100644
--- a/examples/.eslintrc.js
+++ b/examples/.eslintrc.js
@@ -2,5 +2,6 @@ module.exports = {
   "extends": "../.eslintrc.js",
   'globals': {
     '$': true,
+    'window': true,
   }
 };
diff --git a/examples/custom-skip.js b/examples/custom-skip.js
new file mode 100644
index 00000000..671777a4
--- /dev/null
+++ b/examples/custom-skip.js
@@ -0,0 +1,25 @@
+const HCCrawler = require('../');
+
+HCCrawler.launch({
+  cache: null, // Disable default session cache
+  maxConcurrency: 1,
+  evaluatePage: (() => ({
+    title: $('title').text(),
+    h1: $('h1').text(),
+  })),
+  onSuccess: (result => {
+    console.log('onSuccess', result);
+  }),
+  preRequest: (options => {
+    if (options.customSkip) return false;
+    return true;
+  }),
+})
+  .then(crawler => {
+    // You can set a custom option to be used in the preRequest function's options
+    crawler.queue({ url: 'https://example.com/', customSkip: false });
+    crawler.queue({ url: 'https://example.com/', customSkip: false }); // This queue will be requested because the cache is disabled
+    crawler.queue({ url: 'https://example.net/', customSkip: true }); // This queue won't be requested because the preRequest function returns false
+    crawler.onIdle()
+      .then(() => crawler.close());
+  });
diff --git a/examples/delay.js b/examples/delay.js
index fa180e2f..a0eefe8b 100644
--- a/examples/delay.js
+++ b/examples/delay.js
@@ -14,7 +14,6 @@ HCCrawler.launch({
   .then(crawler => {
     crawler.queue({ url: 'https://example.com/' });
     crawler.queue({ url: 'https://example.net/' });
-    crawler.queue({ url: 'https://example.org/' });
     crawler.onIdle()
       .then(() => crawler.close());
   });
diff --git a/examples/disable-jquery.js b/examples/disable-jquery.js
index 0e71a2a1..d6514187 100644
--- a/examples/disable-jquery.js
+++ b/examples/disable-jquery.js
@@ -4,14 +4,16 @@
HCCrawler.launch({
   jQuery: false, // jQuery script tag won't be added
   retryCount: 3, // Retry the same request up to 3 times
   retryDelay: 1000, // Wait 1000msecs before each retry
+  // $ is undefined, which causes an error
   evaluatePage: (() => ({
-    // $ is undefined so that causes an error
     title: $('title').text(),
     h1: $('h1').text(),
   })),
+  // Should not be called because evaluatePage causes an error
   onSuccess: (result => {
     console.log('onSuccess', result);
   }),
+  // Catch the error caused in evaluatePage
   onError: (err => {
     console.error('onError', err);
   }),
diff --git a/examples/emulate-device.js b/examples/emulate-device.js
index cae9d1a4..6a50e747 100644
--- a/examples/emulate-device.js
+++ b/examples/emulate-device.js
@@ -3,7 +3,7 @@ const HCCrawler = require('../');
 HCCrawler.launch({
   evaluatePage: (() => ({
     title: $('title').text(),
-    h1: $('h1').text(),
+    userAgent: window.navigator.userAgent,
   })),
   onSuccess: (result => {
     console.log('onSuccess', result);
@@ -12,6 +12,7 @@ HCCrawler.launch({
   .then(crawler => {
     crawler.queue({ url: 'https://example.com/', device: 'iPhone 6 Plus' });
     crawler.queue({ url: 'https://example.com/', device: 'Nexus 7' });
+    crawler.queue({ url: 'https://example.com/', userAgent: 'Awesome Crawler' }); // Only override userAgent
     crawler.onIdle()
       .then(() => crawler.close());
   });
diff --git a/examples/pause-resume.js b/examples/pause-resume.js
index b1c49b77..47a29e5e 100644
--- a/examples/pause-resume.js
+++ b/examples/pause-resume.js
@@ -17,6 +17,7 @@ HCCrawler.launch({
     crawler.queue({ url: 'https://example.org/' }); // The queue won't be requested until resumed
     crawler.onIdle()
       .then(() => {
+        // Lift the max request limit so that crawling doesn't pause again right after resume() is called
         crawler.setMaxRequest(3);
         crawler.resume();
         return crawler.onIdle();
diff --git a/examples/session-cache.js b/examples/redis-cache.js
similarity index 77%
rename from examples/session-cache.js
rename to examples/redis-cache.js
index a449fef0..8439754b 100644
--- a/examples/session-cache.js
+++ b/examples/redis-cache.js
@@ -1,5 +1,8 @@
 const HCCrawler = require('../');
 
+// Passing no options expects Redis to be run in the local machine with default port.
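+// Unlike the default SessionCache, which keeps requested urls in this process's
+// memory, RedisCache stores them in an external Redis server.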
+const cache = new HCCrawler.RedisCache(); + HCCrawler.launch({ maxConcurrency: 1, evaluatePage: (() => ({ @@ -9,7 +12,7 @@ HCCrawler.launch({ onSuccess: (result => { console.log('onSuccess', result); }), - cache: new HCCrawler.SessionCache(), + cache, }) .then(crawler => { crawler.queue('https://example.com/'); diff --git a/examples/skip-request.js b/examples/skip-request.js deleted file mode 100644 index 4d4bedfd..00000000 --- a/examples/skip-request.js +++ /dev/null @@ -1,23 +0,0 @@ -const HCCrawler = require('../'); - -HCCrawler.launch({ - maxConcurrency: 1, - evaluatePage: (() => ({ - title: $('title').text(), - h1: $('h1').text(), - p: $('p').text(), - })), - onSuccess: (result => { - console.log('onSuccess', result); - }), - preRequest: (options => { - if (options.url === 'https://example.net/') return false; - return true; - }), -}) - .then(crawler => { - crawler.queue('https://example.com/'); - crawler.queue('https://example.net/'); - crawler.onIdle() - .then(() => crawler.close()); - }); diff --git a/lib/cache/base.js b/lib/cache/base.js index 0cf1b67d..c7f954c9 100644 --- a/lib/cache/base.js +++ b/lib/cache/base.js @@ -71,6 +71,16 @@ class BaseCache { throw new Error('Set is not overridden!'); } + /** + * Method to remove already requested option from the cache storage + * @param {Object} options + * @return {Promise} resolves when remove operation completed + * @interface + */ + remove() { + throw new Error('Remove is not overridden!'); + } + /** * Method to check whether the requested options already exists in the cache storage * @param {Object} options diff --git a/lib/cache/redis.js b/lib/cache/redis.js index d78dd083..37c17e2d 100644 --- a/lib/cache/redis.js +++ b/lib/cache/redis.js @@ -62,6 +62,21 @@ class RedisCache extends BaseCache { }); }); } + + /** + * @override + */ + remove(options) { + return new Promise((resolve, reject) => { + this._client.del(BaseCache.key(options), error => { + if (error) { + reject(error); + return; + } + resolve(); + }); + }); + } } module.exports = RedisCache; diff --git a/lib/cache/session.js b/lib/cache/session.js index 17a67b1b..8a181b9f 100644 --- a/lib/cache/session.js +++ b/lib/cache/session.js @@ -39,6 +39,14 @@ class SessionCache extends BaseCache { this._storage[BaseCache.key(options)] = true; return Promise.resolve(); } + + /** + * @override + */ + remove(options) { + delete this._storage[BaseCache.key(options)]; + return Promise.resolve(); + } } module.exports = SessionCache; diff --git a/lib/hccrawler.js b/lib/hccrawler.js index 0bd01b4a..4b797bb4 100644 --- a/lib/hccrawler.js +++ b/lib/hccrawler.js @@ -113,7 +113,8 @@ class HCCrawler { retryCount: 3, retryDelay: 10000, jQuery: true, - clearCacheOnEnd: true, + cache: new SessionCache(), + ensureClearCache: true, }, options); this._pQueue = new PQueue({ concurrency: this._options.maxConcurrency, @@ -143,7 +144,7 @@ class HCCrawler { close() { return Promise.all([ this._browser.close(), - this._clearCacheOnEnd().then(() => this.closeCache()), + this._clearCacheOnEnd().then(() => this._closeCache()), ]); } @@ -154,7 +155,7 @@ class HCCrawler { disconnect() { return Promise.all([ this._browser.disconnect(), - this._clearCacheOnEnd().then(() => this.closeCache()), + this._clearCacheOnEnd().then(() => this._closeCache()), ]); } @@ -200,22 +201,6 @@ class HCCrawler { return this._pQueue.start(); } - /** - * @return {Promise} resolved when cache has been cleared - */ - clearCache() { - if (!this.cache) return Promise.resolve(); - return this.cache.clear(); - } - - /** - * @return 
{Promise} resolved when cache has been closed - */ - closeCache() { - if (!this.cache) return Promise.resolve(); - return this.cache.close(); - } - /** * Get paused status * @return {bolean} paused @@ -305,12 +290,14 @@ class HCCrawler { return this._newPage(options) .then(crawler => crawler.crawl()) .then(() => delay(options.delay)) - .then(() => this._checkRequestCount()); + .then(() => void this._checkRequestCount()); }) .catch(err => { if (retryCount >= options.retryCount) throw new Error(`Retry give-up for requesting ${options.url}!`, err); debugRequest(`Retry requesting ${options.url} ${retryCount + 1} times`); - return delay(options.retryDelay).then(() => this._request(options, retryCount + 1)); + return delay(options.retryDelay) + .then(() => this._removeExists(options)) + .then(() => this._request(options, retryCount + 1)); }) .catch(err => { debugRequest(`Retry give-up for requesting ${options.url} after ${retryCount} tries`); @@ -340,6 +327,16 @@ class HCCrawler { .then(exists => this.cache.set(options).then(() => exists)); } + /** + * @param {Object} options + * @return {Promise} resolved when already accessed options are removed + * @private + */ + _removeExists(options) { + if (!this.cache) return Promise.resolve(false); + return this.cache.remove(options); + } + /** * @param {Puppeteer.Page} page * @param {Object} options @@ -367,9 +364,7 @@ class HCCrawler { this._requestedCount += 1; if (this._options.maxRequest && this._requestedCount >= this._options.maxRequest) { this.pause(); - return this._clearCacheOnEnd(); } - return Promise.resolve(); } /** @@ -377,9 +372,27 @@ class HCCrawler { * @private */ _clearCacheOnEnd() { - if (this._options.clearCacheOnEnd) return this.clearCache(); + if (this._options.ensureClearCache) return this._clearCache(); return Promise.resolve(); } + + /** + * @return {Promise} resolved when cache has been cleared + * @private + */ + _clearCache() { + if (!this.cache) return Promise.resolve(); + return this.cache.clear(); + } + + /** + * @return {Promise} resolved when cache has been closed + * @private + */ + _closeCache() { + if (!this.cache) return Promise.resolve(); + return this.cache.close(); + } } module.exports = HCCrawler; diff --git a/test/hccrawler.test.js b/test/hccrawler.test.js index 46871ddc..3caae180 100644 --- a/test/hccrawler.test.js +++ b/test/hccrawler.test.js @@ -21,14 +21,14 @@ describe('HCCrawler', () => { }); context('when launched without necessary options', () => { - before(() => ( + beforeEach(() => ( HCCrawler.launch() .then(_crawler => { crawler = _crawler; }) )); - after(() => crawler.close()); + afterEach(() => crawler.close()); it('throws error when queueing null', () => { assert.throws(() => { @@ -66,7 +66,7 @@ describe('HCCrawler', () => { }); context('when launched with necessary options', () => { - before(() => ( + beforeEach(() => ( HCCrawler.launch({ evaluatePage: _.noop, onSuccess: _.noop, @@ -77,7 +77,7 @@ describe('HCCrawler', () => { }) )); - after(() => crawler.close()); + afterEach(() => crawler.close()); it('crawls with single string options', () => { assert.doesNotThrow(() => { @@ -207,7 +207,7 @@ describe('HCCrawler', () => { }); context('when launched with maxConcurrency: 1', () => { - before(() => ( + beforeEach(() => ( HCCrawler.launch({ evaluatePage: _.noop, onSuccess: _.noop, @@ -218,7 +218,7 @@ describe('HCCrawler', () => { }) )); - after(() => crawler.close()); + afterEach(() => crawler.close()); it('obeys priority order', () => { assert.doesNotThrow(() => { @@ -292,7 +292,7 @@ 
describe('HCCrawler', () => { }); context('when launched with session cache', () => { - before(() => ( + beforeEach(() => ( HCCrawler.launch({ evaluatePage: _.noop, onSuccess: _.noop, @@ -305,7 +305,7 @@ describe('HCCrawler', () => { }) )); - after(() => crawler.close()); + afterEach(() => crawler.close()); it('does not requested already cached url', () => { assert.doesNotThrow(() => { @@ -321,7 +321,7 @@ describe('HCCrawler', () => { }); context('when launched with redis cache', () => { - before(() => ( + beforeEach(() => ( HCCrawler.launch({ evaluatePage: _.noop, onSuccess: _.noop, @@ -334,7 +334,7 @@ describe('HCCrawler', () => { }) )); - after(() => crawler.close()); + afterEach(() => crawler.close()); it('does not requested already cached url', () => { assert.doesNotThrow(() => { @@ -356,7 +356,7 @@ describe('HCCrawler', () => { }); context('when launched with necessary options', () => { - before(() => ( + beforeEach(() => ( HCCrawler.launch({ evaluatePage: _.noop, onSuccess: _.noop, @@ -366,7 +366,7 @@ describe('HCCrawler', () => { }) )); - after(() => crawler.close()); + afterEach(() => crawler.close()); it('retries and gives up', () => { assert.doesNotThrow(() => {