From c394cd7c3f68514a2d9f1432583f2730f70bc7b4 Mon Sep 17 00:00:00 2001 From: yujiosaka Date: Sun, 10 Dec 2017 12:37:39 +0900 Subject: [PATCH 1/2] Add pluggable cache --- README.md | 18 +- examples/delay.js | 7 +- examples/disable-jquery.js | 3 +- examples/emulate-device.js | 6 +- examples/multiple-queue.js | 5 +- examples/override-function.js | 3 +- examples/pause-resume.js | 25 +++ examples/priority-queue.js | 7 +- examples/session-cache.js | 20 +++ .../{skip-duplicates.js => skip-request.js} | 10 +- lib/cache/base.js | 86 ++++++++++ lib/cache/index.js | 3 + lib/cache/redis.js | 67 ++++++++ lib/cache/session.js | 44 +++++ lib/hccrawler.js | 157 +++++++++++++++--- lib/helper.js | 29 ++++ package.json | 3 +- test/hccrawler.test.js | 83 ++++++++- test/helper.test.js | 42 ++++- yarn.lock | 20 +++ 20 files changed, 574 insertions(+), 64 deletions(-) create mode 100644 examples/pause-resume.js create mode 100644 examples/session-cache.js rename examples/{skip-duplicates.js => skip-request.js} (58%) create mode 100644 lib/cache/base.js create mode 100644 lib/cache/index.js create mode 100644 lib/cache/redis.js create mode 100644 lib/cache/session.js diff --git a/README.md b/README.md index 484b18b8..e4b6971b 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,9 @@ Crawlers based on simple requests to html files are generally fast. However, it Powered by [Puppeteer](https://github.com/GoogleChrome/puppeteer), headless-chrome-crawler allows you to scrape those single page applications with the following features: * Configure concurrency, delay and retries +* Pluggable cache to skip duplicate requests * Cancel requests by conditions +* Pause and resume at any time * Insert [jQuery](https://jquery.com) automatically * Priority queue * Device emulation @@ -34,24 +36,25 @@ The basic API of headless-chrome-crawler is inspired by that of [node-crawler](h const HCCrawler = require('headless-chrome-crawler'); HCCrawler.launch({ + // Function to be evaluated in browsers evaluatePage: (() => ({ title: $('title').text(), h1: $('h1').text(), - p: $('p').text(), })), + // Function to be called with evaluated results from browsers onSuccess: (result => { - console.log('onSuccess', result); // resolves status, options and evaluated result. + console.log('onSuccess', result); }), }) .then(crawler => { // Queue a single request - crawler.queue('https://example.com'); + crawler.queue('https://example.com/'); // Queue multiple requests - crawler.queue(['https://example.net', 'https://example.org']); + crawler.queue(['https://example.net/', 'https://example.org/']); // Queue a query custom options crawler.queue({ jQuery: false, - url: 'https://example.com', + url: 'https://example.com/', evaluatePage: (() => ({ title: document.title, h1: document.getElementsByTagName('h1')[0].innerText, @@ -81,7 +84,6 @@ See [here](https://github.com/yujiosaka/headless-chrome-crawler/tree/master/exam * [crawler.version()](#crawlerversion) * [crawler.wsEndpoint()](#crawlerwsendpoint) * [crawler.onIdle()](#crawleronidle) - * [crawler.onEnd()](#crawleronend) * [crawler.queueSize](#crawlerqueuesize) * [crawler.pendingQueueSize](#crawlerpendingqueuesize) * [crawler.requestedCount](#crawlerrequestedcount) @@ -193,10 +195,6 @@ See [Puppeteer's browser.wsEndpoint()](https://github.com/GoogleChrome/puppeteer - returns: <[Promise]> Promise which is resolved when queues become empty. -#### crawler.onEnd() - -- returns: <[Promise]> Promise which is resolved when request reaches max. 
- #### crawler.queueSize * returns: <[number]> The size of queues. This property is read only. diff --git a/examples/delay.js b/examples/delay.js index aa5af491..fa180e2f 100644 --- a/examples/delay.js +++ b/examples/delay.js @@ -6,16 +6,15 @@ HCCrawler.launch({ evaluatePage: (() => ({ title: $('title').text(), h1: $('h1').text(), - p: $('p').text(), })), onSuccess: (result => { console.log('onSuccess', result); }), }) .then(crawler => { - crawler.queue({ url: 'https://example.com' }); - crawler.queue({ url: 'https://example.net' }); - crawler.queue({ url: 'https://example.org' }); + crawler.queue({ url: 'https://example.com/' }); + crawler.queue({ url: 'https://example.net/' }); + crawler.queue({ url: 'https://example.org/' }); crawler.onIdle() .then(() => crawler.close()); }); diff --git a/examples/disable-jquery.js b/examples/disable-jquery.js index 498b5b6a..0e71a2a1 100644 --- a/examples/disable-jquery.js +++ b/examples/disable-jquery.js @@ -8,7 +8,6 @@ HCCrawler.launch({ // $ is undefined so that causes an error title: $('title').text(), h1: $('h1').text(), - p: $('p').text(), })), onSuccess: (result => { console.log('onSuccess', result); @@ -18,7 +17,7 @@ HCCrawler.launch({ }), }) .then(crawler => { - crawler.queue('https://example.com'); + crawler.queue('https://example.com/'); crawler.onIdle() .then(() => crawler.close()); }); diff --git a/examples/emulate-device.js b/examples/emulate-device.js index 1c07a81d..cae9d1a4 100644 --- a/examples/emulate-device.js +++ b/examples/emulate-device.js @@ -4,16 +4,14 @@ HCCrawler.launch({ evaluatePage: (() => ({ title: $('title').text(), h1: $('h1').text(), - p: $('p').text(), })), onSuccess: (result => { console.log('onSuccess', result); }), }) .then(crawler => { - crawler.queue({ url: 'https://example.com', device: 'iPhone 6 Plus' }); - crawler.queue({ url: 'https://example.com', device: 'iPad' }); - crawler.queue({ url: 'https://example.com', device: 'Nexus 7' }); + crawler.queue({ url: 'https://example.com/', device: 'iPhone 6 Plus' }); + crawler.queue({ url: 'https://example.com/', device: 'Nexus 7' }); crawler.onIdle() .then(() => crawler.close()); }); diff --git a/examples/multiple-queue.js b/examples/multiple-queue.js index 456a8c9f..433919b7 100644 --- a/examples/multiple-queue.js +++ b/examples/multiple-queue.js @@ -4,15 +4,14 @@ HCCrawler.launch({ evaluatePage: (() => ({ title: $('title').text(), h1: $('h1').text(), - p: $('p').text(), })), onSuccess: (result => { console.log('onSuccess', result); }), }) .then(crawler => { - crawler.queue('https://example.com'); // one URL - crawler.queue(['https://example.net', { url: 'https://example.org' }]); // multiple URLs in different styles. + crawler.queue('https://example.com/'); // one URL + crawler.queue(['https://example.net/', { url: 'https://example.org/' }]); // multiple URLs in different styles. 
crawler.onIdle() .then(() => crawler.close()); }); diff --git a/examples/override-function.js b/examples/override-function.js index 61439b82..7814d2d5 100644 --- a/examples/override-function.js +++ b/examples/override-function.js @@ -11,11 +11,10 @@ HCCrawler.launch({ }) .then(crawler => { crawler.queue({ - url: 'https://example.com', + url: 'https://example.com/', evaluatePage: (() => ({ title: $('title').text(), h1: $('h1').text(), - p: $('p').text(), })), onSuccess: (result => { console.log('onSuccess', result); diff --git a/examples/pause-resume.js b/examples/pause-resume.js new file mode 100644 index 00000000..b1c49b77 --- /dev/null +++ b/examples/pause-resume.js @@ -0,0 +1,25 @@ +const HCCrawler = require('../'); + +HCCrawler.launch({ + maxConcurrency: 1, + maxRequest: 2, + evaluatePage: (() => ({ + title: $('title').text(), + h1: $('h1').text(), + })), + onSuccess: (result => { + console.log('onSuccess', result); + }), +}) + .then(crawler => { + crawler.queue({ url: 'https://example.com/' }); + crawler.queue({ url: 'https://example.net/' }); + crawler.queue({ url: 'https://example.org/' }); // The queue won't be requested until resumed + crawler.onIdle() + .then(() => { + crawler.setMaxRequest(3); + crawler.resume(); + return crawler.onIdle(); + }) + .then(() => crawler.close()); + }); diff --git a/examples/priority-queue.js b/examples/priority-queue.js index ac142ae5..6cde1ed3 100644 --- a/examples/priority-queue.js +++ b/examples/priority-queue.js @@ -5,16 +5,15 @@ HCCrawler.launch({ evaluatePage: (() => ({ title: $('title').text(), h1: $('h1').text(), - p: $('p').text(), })), onSuccess: (result => { console.log('onSuccess', result); }), }) .then(crawler => { - crawler.queue({ url: 'https://example.com' }); // First queue will be requested first regardless of priority - crawler.queue({ url: 'https://example.net', priority: 1 }); - crawler.queue({ url: 'https://example.org', priority: 2 }); // This queue is requested before the previous queue + crawler.queue({ url: 'https://example.com/' }); // First queue will be requested first regardless of priority + crawler.queue({ url: 'https://example.net/', priority: 1 }); + crawler.queue({ url: 'https://example.org/', priority: 2 }); // This queue is requested before the previous queue crawler.onIdle() .then(() => crawler.close()); }); diff --git a/examples/session-cache.js b/examples/session-cache.js new file mode 100644 index 00000000..a449fef0 --- /dev/null +++ b/examples/session-cache.js @@ -0,0 +1,20 @@ +const HCCrawler = require('../'); + +HCCrawler.launch({ + maxConcurrency: 1, + evaluatePage: (() => ({ + title: $('title').text(), + h1: $('h1').text(), + })), + onSuccess: (result => { + console.log('onSuccess', result); + }), + cache: new HCCrawler.SessionCache(), +}) + .then(crawler => { + crawler.queue('https://example.com/'); + crawler.queue('https://example.net/'); + crawler.queue('https://example.com/'); // The queue won't be requested + crawler.onIdle() + .then(() => crawler.close()); + }); diff --git a/examples/skip-duplicates.js b/examples/skip-request.js similarity index 58% rename from examples/skip-duplicates.js rename to examples/skip-request.js index 3fa775d1..4d4bedfd 100644 --- a/examples/skip-duplicates.js +++ b/examples/skip-request.js @@ -1,7 +1,5 @@ const HCCrawler = require('../'); -const requestedObj = {}; - HCCrawler.launch({ maxConcurrency: 1, evaluatePage: (() => ({ @@ -10,18 +8,16 @@ HCCrawler.launch({ p: $('p').text(), })), onSuccess: (result => { - requestedObj[result.options.url] = true; 
     console.log('onSuccess', result);
   }),
   preRequest: (options => {
-    if (requestedObj[options.url]) return false;
+    if (options.url === 'https://example.net/') return false;
     return true;
   }),
 })
   .then(crawler => {
-    crawler.queue('https://example.com');
-    crawler.queue('https://example.net');
-    crawler.queue('https://example.com'); // The queue won't be requested
+    crawler.queue('https://example.com/');
+    crawler.queue('https://example.net/');
     crawler.onIdle()
       .then(() => crawler.close());
   });
diff --git a/lib/cache/base.js b/lib/cache/base.js
new file mode 100644
index 00000000..0cf1b67d
--- /dev/null
+++ b/lib/cache/base.js
@@ -0,0 +1,86 @@
+const _ = require('lodash');
+const { hash, jsonStableReplacer } = require('../helper');
+
+const OMITTED_HASH_FIELDS = [
+  'priority',
+  'allowedDomains',
+  'delay',
+  'retryCount',
+  'retryDelay',
+  'jQuery',
+  'username',
+  'password',
+  'preRequest',
+  'evaluatePage',
+  'onSuccess',
+  'onError',
+  'timeout',
+  'waitUntil',
+];
+const MAX_LENGTH = 10;
+
+class BaseCache {
+  constructor(settings) {
+    this._settings = settings;
+  }
+
+  /**
+   * Initializing the cache storage
+   * @return {Promise} resolves when init operation completed
+   * @interface
+   */
+  init() {
+    throw new Error('Init is not overridden!');
+  }
+
+  /**
+   * Closing the cache storage
+   * @return {Promise} resolves when close operation completed
+   * @interface
+   */
+  close() {
+    throw new Error('Close is not overridden!');
+  }
+
+  /**
+   * Clearing the cache storage
+   * @return {Promise} resolves when clear operation completed
+   * @interface
+   */
+  clear() {
+    throw new Error('Clear is not overridden!');
+  }
+
+  /**
+   * Method to check whether the requested options already exist in the cache storage
+   * @param {Object} options
+   * @return {Promise} resolves whether the requested options already exist
+   * @interface
+   */
+  exists() {
+    throw new Error('Exists is not overridden!');
+  }
+
+  /**
+   * Method to set the requested options to the cache storage
+   * @param {Object} options
+   * @return {Promise} resolves when set operation completed
+   * @interface
+   */
+  set() {
+    throw new Error('Set is not overridden!');
+  }
+
+  /**
+   * Method to generate a cache key from the requested options
+   * @param {Object} options
+   * @return {String} cache key for the options
+   * @static
+   */
+  static key(options) {
+    const json = JSON.stringify(_.omit(options, OMITTED_HASH_FIELDS), jsonStableReplacer);
+    return hash(json).substring(0, MAX_LENGTH);
+  }
+}
+
+module.exports = BaseCache;
diff --git a/lib/cache/index.js b/lib/cache/index.js
new file mode 100644
index 00000000..46e56262
--- /dev/null
+++ b/lib/cache/index.js
@@ -0,0 +1,3 @@
+exports.BaseCache = require('./base');
+exports.SessionCache = require('./session');
+exports.RedisCache = require('./redis');
diff --git a/lib/cache/redis.js b/lib/cache/redis.js
new file mode 100644
index 00000000..d78dd083
--- /dev/null
+++ b/lib/cache/redis.js
@@ -0,0 +1,67 @@
+const BaseCache = require('./base');
+const redis = require('redis');
+
+class RedisCache extends BaseCache {
+  /**
+   * @override
+   */
+  init() {
+    this._client = redis.createClient(this._settings);
+    return Promise.resolve();
+  }
+
+  /**
+   * @override
+   */
+  clear() {
+    return new Promise((resolve, reject) => {
+      this._client.flushdb(error => {
+        if (error) {
+          reject(error);
+          return;
+        }
+        resolve();
+      });
+    });
+  }
+
+  /**
+   * @override
+   */
+  close() {
+    this._client.quit();
+    return Promise.resolve();
+  }
+
+  /**
+   * @override
+   */
+  exists(options) {
+    return new Promise((resolve, reject) => {
+      this._client.exists(BaseCache.key(options), (error, exists) => {
+        if (error) {
+          reject(error);
+          return;
+        }
+        resolve(exists);
+      });
+    });
+  }
+
+  /**
+   * @override
+   */
+  set(options) {
+    return new Promise((resolve, reject) => {
+      this._client.set(BaseCache.key(options), '1', error => {
+        if (error) {
+          reject(error);
+          return;
+        }
+        resolve();
+      });
+    });
+  }
+}
+
+module.exports = RedisCache;
diff --git a/lib/cache/session.js b/lib/cache/session.js
new file mode 100644
index 00000000..17a67b1b
--- /dev/null
+++ b/lib/cache/session.js
@@ -0,0 +1,44 @@
+const BaseCache = require('./base');
+
+class SessionCache extends BaseCache {
+  /**
+   * @override
+   */
+  init() {
+    this._storage = {};
+    return Promise.resolve();
+  }
+
+  /**
+   * @override
+   */
+  clear() {
+    this._storage = {};
+    return Promise.resolve();
+  }
+
+  /**
+   * @override
+   */
+  close() {
+    this._storage = {};
+    return Promise.resolve();
+  }
+
+  /**
+   * @override
+   */
+  exists(options) {
+    return Promise.resolve(this._storage[BaseCache.key(options)] || false);
+  }
+
+  /**
+   * @override
+   */
+  set(options) {
+    this._storage[BaseCache.key(options)] = true;
+    return Promise.resolve();
+  }
+}
+
+module.exports = SessionCache;
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index 26fe920a..0bd01b4a 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -5,6 +5,7 @@ const Puppeteer = require('puppeteer');
 const devices = require('puppeteer/DeviceDescriptors');
 const Crawler = require('./crawler');
 const { delay, debugRequest } = require('./helper');
+const { BaseCache, SessionCache, RedisCache } = require('./cache');
 
 const PUPPETEER_CONNECT_OPTIONS = [
   'browserWSEndpoint',
@@ -28,11 +29,43 @@ const PUPPETEER_LAUNCH_OPTIONS = [
 const HCCRAWLER_OPTIONS = [
   'maxConcurrency',
   'maxRequest',
+  'cache',
+  'clearCacheOnEnd',
 ];
 
 const deviceNames = Object.keys(devices);
 
 class HCCrawler {
+  /**
+   * Get the base cache class
+   * @return {BaseCache} base cache
+   * @readonly
+   * @static
+   */
+  static get BaseCache() {
+    return BaseCache;
+  }
+
+  /**
+   * Get the session cache class
+   * @return {SessionCache} session cache
+   * @readonly
+   * @static
+   */
+  static get SessionCache() {
+    return SessionCache;
+  }
+
+  /**
+   * Get the Redis cache class
+   * @return {RedisCache} Redis cache
+   * @readonly
+   * @static
+   */
+  static get RedisCache() {
+    return RedisCache;
+  }
+
   /**
    * Connect to an existing Chromium instance
    * @param {Object} options
@@ -41,7 +74,8 @@
    */
   static connect(options) {
     return Puppeteer.connect(_.pick(options, PUPPETEER_CONNECT_OPTIONS))
-      .then(browser => new HCCrawler(browser, _.omit(options, PUPPETEER_CONNECT_OPTIONS)));
+      .then(browser => new HCCrawler(browser, _.omit(options, PUPPETEER_CONNECT_OPTIONS)))
+      .then(crawler => crawler._init().then(() => crawler));
   }
 
   /**
@@ -52,7 +86,8 @@
    */
   static launch(options) {
     return Puppeteer.launch(_.pick(options, PUPPETEER_LAUNCH_OPTIONS))
-      .then(browser => new HCCrawler(browser, _.omit(options, PUPPETEER_LAUNCH_OPTIONS)));
+      .then(browser => new HCCrawler(browser, _.omit(options, PUPPETEER_LAUNCH_OPTIONS)))
+      .then(crawler => crawler._init().then(() => crawler));
   }
 
   /**
@@ -78,14 +113,12 @@ class HCCrawler {
       retryCount: 3,
       retryDelay: 10000,
       jQuery: true,
+      clearCacheOnEnd: true,
     }, options);
     this._pQueue = new PQueue({
       concurrency: this._options.maxConcurrency,
     });
     this._requestedCount = 0;
-    this._resolveOnEnd = () => {
-      this._pQueue.pause();
-    };
   }
 
   /**
@@ -108,7 +141,10 @@
   * @return {Promise} resolved when the crawler is closed
   */
  close() {
-    return this._browser.close();
+    return Promise.all([
+      this._browser.close(),
+      this._clearCacheOnEnd().then(() => this.closeCache()),
+    ]);
  }

  /**
@@ -116,7 +152,10 @@
   * @return {Promise} resolved when the crawler is disconnected
   */
  disconnect() {
-    return this._browser.disconnect();
+    return Promise.all([
+      this._browser.disconnect(),
+      this._clearCacheOnEnd().then(() => this.closeCache()),
+    ]);
  }

  /**
@@ -141,16 +180,49 @@
  }

  /**
-   * @return {Promise} resolved when reached the max request
+   * Set the maxRequest option after launch
   */
-  onEnd() {
-    return new Promise(resolve => {
-      const oldResolveOnEnd = this._resolveOnEnd;
-      this._resolveOnEnd = () => {
-        oldResolveOnEnd();
-        resolve();
-      };
-    });
+  setMaxRequest(maxRequest) {
+    this._options.maxRequest = maxRequest;
+  }
+
+  /**
+   * Pause requests temporarily
+   */
+  pause() {
+    return this._pQueue.pause();
+  }
+
+  /**
+   * Resume paused requests
+   */
+  resume() {
+    return this._pQueue.start();
+  }
+
+  /**
+   * @return {Promise} resolved when cache has been cleared
+   */
+  clearCache() {
+    if (!this.cache) return Promise.resolve();
+    return this.cache.clear();
+  }
+
+  /**
+   * @return {Promise} resolved when cache has been closed
+   */
+  closeCache() {
+    if (!this.cache) return Promise.resolve();
+    return this.cache.close();
+  }
+
+  /**
+   * Get paused status
+   * @return {boolean} paused
+   * @readonly
+   */
+  get isPaused() {
+    return this._pQueue.isPaused();
  }

  /**
@@ -180,6 +252,24 @@
    return this._requestedCount;
  }

+  /**
+   * Get the cache storage
+   * @return {Cache} cache storage
+   * @readonly
+   */
+  get cache() {
+    return this._options.cache;
+  }
+
+  /**
+   * @return {Promise} resolved when initialization completed
+   * @private
+   */
+  _init() {
+    if (!this.cache) return Promise.resolve();
+    return this.cache.init();
+  }
+
  /**
   * @param {Object} options
   * @private
@@ -203,16 +293,19 @@
      debugRequest(`Skip requesting ${options.url}`);
      return Promise.resolve();
    }
-    return this._preRequest(options)
-      .then(shouldRequest => {
-        if (!shouldRequest) {
+    return Promise.all([
+      this._checkExists(options),
+      this._preRequest(options),
+    ])
+      .then(([exists, shouldRequest]) => {
+        if (exists || !shouldRequest) {
          debugRequest(`Skip requesting ${options.url}`);
          return Promise.resolve();
        }
        return this._newPage(options)
          .then(crawler => crawler.crawl())
          .then(() => delay(options.delay))
-          .then(() => void this._checkRequestCount());
+          .then(() => this._checkRequestCount());
      })
      .catch(err => {
        if (retryCount >= options.retryCount) throw new Error(`Retry give-up for requesting ${options.url}!`, err);
@@ -236,6 +329,17 @@
    return _.some(options.allowedDomains, domain => _.endsWith(hostname, domain));
  }

+  /**
+   * @param {Object} options
+   * @return {Promise} whether the requested options already exist in the cache storage
+   * @private
+   */
+  _checkExists(options) {
+    if (!this.cache) return Promise.resolve(false);
+    return this.cache.exists(options)
+      .then(exists => this.cache.set(options).then(() => exists));
+  }
+
  /**
   * @param {Puppeteer.Page} page
   * @param {Object} options
@@ -262,8 +366,19 @@
  _checkRequestCount() {
    this._requestedCount += 1;
    if (this._options.maxRequest && this._requestedCount >= this._options.maxRequest) {
-      this._resolveOnEnd();
+      this.pause();
+      return this._clearCacheOnEnd();
    }
+    return Promise.resolve();
+  }
+
+  /**
+   * @return {Promise} resolved when the cache is cleared
+   * @private
+   */
+  _clearCacheOnEnd() {
+    if (this._options.clearCacheOnEnd) return this.clearCache();
+    return Promise.resolve();
+  }
 }
diff --git a/lib/helper.js b/lib/helper.js
index 70c64105..4b6aa99d 100644
--- a/lib/helper.js
+++ b/lib/helper.js
@@ -1,3 +1,5 @@
+const crypto = require('crypto');
+const _ = require('lodash');
 const debugRequest = require('debug')('hccrawler:request');
 const debugBrowser = require('debug')('hccrawler:browser');
 
@@ -12,6 +14,33 @@
     return new Promise(resolve => setTimeout(resolve, milliseconds));
   }
 
+  /**
+   * Get MD5 hashed hex string
+   * @param {String} src
+   * @return {String} hashed src
+   * @static
+   */
+  static hash(src) {
+    const md5hash = crypto.createHash('md5');
+    md5hash.update(src, 'binary');
+    return md5hash.digest('hex');
+  }
+
+  /**
+   * Get a consistent object for JSON.stringify
+   * @param {String} key
+   * @param {*} val
+   * @return {*} ordered object
+   * @static
+   */
+  static jsonStableReplacer(key, val) {
+    if (!_.isPlainObject(val)) return val;
+    return Object.keys(val).sort().reduce((obj, _key) => {
+      obj[_key] = val[_key]; /* eslint no-param-reassign: 0 */
+      return obj;
+    }, {});
+  }
+
   /**
    * Debug log for request events
    * @param {string} msg
diff --git a/package.json b/package.json
index ba8c1952..b9991e1c 100644
--- a/package.json
+++ b/package.json
@@ -23,7 +23,8 @@
     "jquery": "3.2.1",
     "lodash": "4.17.4",
     "p-queue": "2.3.0",
-    "puppeteer": "0.13.0"
+    "puppeteer": "0.13.0",
+    "redis": "2.8.0"
   },
   "devDependencies": {
     "eslint": "4.11.0",
diff --git a/test/hccrawler.test.js b/test/hccrawler.test.js
index e223de98..46871ddc 100644
--- a/test/hccrawler.test.js
+++ b/test/hccrawler.test.js
@@ -247,7 +247,7 @@
   });
 
   context('when launched with maxRequest option', () => {
-    before(() => (
+    beforeEach(() => (
       HCCrawler.launch({
         evaluatePage: _.noop,
         onSuccess: _.noop,
@@ -259,7 +259,7 @@
       })
     ));
 
-    after(() => crawler.close());
+    afterEach(() => crawler.close());
 
     it('requests until maxRequest', () => {
       assert.doesNotThrow(() => {
@@ -267,10 +267,83 @@
         crawler.queue({ url: URL2 });
         crawler.queue({ url: URL3 });
       });
-      return crawler.onEnd()
+      return crawler.onIdle()
+        .then(() => {
+          assert.equal(Crawler.prototype.crawl.callCount, 2);
+        });
+    });
+
+    it('resumes after maxRequest', () => {
+      assert.doesNotThrow(() => {
+        crawler.queue({ url: URL1 });
+        crawler.queue({ url: URL2 });
+        crawler.queue({ url: URL3 });
+      });
+      return crawler.onIdle()
+        .then(() => {
+          crawler.setMaxRequest(3);
+          crawler.resume();
+          return crawler.onIdle();
+        })
+        .then(() => {
+          assert.equal(Crawler.prototype.crawl.callCount, 3);
+        });
+    });
+  });
+
+  context('when launched with session cache', () => {
+    before(() => (
+      HCCrawler.launch({
+        evaluatePage: _.noop,
+        onSuccess: _.noop,
+        maxConcurrency: 1,
+        maxRequest: 2,
+        cache: new HCCrawler.SessionCache(),
+      })
+        .then(_crawler => {
+          crawler = _crawler;
+        })
+    ));
+
+    after(() => crawler.close());
+
+    it('does not request already cached url', () => {
+      assert.doesNotThrow(() => {
+        crawler.queue({ url: URL1 });
+        crawler.queue({ url: URL2 });
+        crawler.queue({ url: URL1 }); // The queue won't be requested
+      });
+      return crawler.onIdle()
+        .then(() => {
+          assert.equal(Crawler.prototype.crawl.callCount, 2);
+        });
+    });
+  });
+
+  context('when launched with redis cache', () => {
+    before(() => (
+      HCCrawler.launch({
+        evaluatePage: _.noop,
+        onSuccess: _.noop,
+        maxConcurrency: 1,
+        maxRequest: 2,
+        cache: new HCCrawler.RedisCache(),
+      })
+        .then(_crawler => {
+          crawler = _crawler;
+        })
+    ));
+
+    after(() => crawler.close());
+
+    it('does not request already cached url', () => {
+      assert.doesNotThrow(() => {
+        crawler.queue({ url: URL1 });
+        crawler.queue({ url: URL2 });
+        crawler.queue({ url: URL1 }); // The queue won't be requested
+      });
+      return crawler.onIdle()
         .then(() => {
-          assert.equal(crawler.queueSize, 1);
-          assert.equal(crawler.pendingQueueSize, 1);
           assert.equal(Crawler.prototype.crawl.callCount, 2);
         });
     });
diff --git a/test/helper.test.js b/test/helper.test.js
index fb6a14c9..711c80a7 100644
--- a/test/helper.test.js
+++ b/test/helper.test.js
@@ -1,5 +1,11 @@
 const assert = require('assert');
-const { delay } = require('../lib/helper');
+const {
+  delay,
+  jsonStableReplacer,
+  hash,
+  debugRequest,
+  debugBrowser,
+} = require('../lib/helper');
 
 describe('Helper', () => {
   describe('Helper.delay', () => {
@@ -23,4 +29,38 @@
       });
     });
   });
+
+  describe('Helper.hash', () => {
+    it('returns the same results for same sources', () => {
+      const src = '{"url":"http://example.com/"}';
+      const result1 = hash(src);
+      const result2 = hash(src);
+      assert.equal(result1, result2);
+    });
+  });
+
+  describe('Helper.jsonStableReplacer', () => {
+    it('sorts keys in order', () => {
+      const json = { c: 8, b: [{ z: 6, y: 5, x: 4 }, 7], a: 3 };
+      const expected = '{"a":3,"b":[{"x":4,"y":5,"z":6},7],"c":8}';
+      const actual = JSON.stringify(json, jsonStableReplacer);
+      assert.equal(actual, expected);
+    });
+  });
+
+  describe('Helper.debugRequest', () => {
+    it('does not throw errors', () => {
+      assert.doesNotThrow(() => {
+        debugRequest('Start requesting http://example.com/');
+      });
+    });
+  });
+
+  describe('Helper.debugBrowser', () => {
+    it('does not throw errors', () => {
+      assert.doesNotThrow(() => {
+        debugBrowser('Console log init.. http://example.com/');
+      });
+    });
+  });
 });
diff --git a/yarn.lock b/yarn.lock
index d9dd3155..194cf1ec 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -327,6 +327,10 @@ doctrine@^2.0.0:
   dependencies:
     esutils "^2.0.2"
 
+double-ended-queue@^2.1.0-0:
+  version "2.1.0-0"
+  resolved "https://registry.yarnpkg.com/double-ended-queue/-/double-ended-queue-2.1.0-0.tgz#103d3527fd31528f40188130c841efdd78264e5c"
+
 eastasianwidth@^0.1.1:
   version "0.1.1"
   resolved "https://registry.yarnpkg.com/eastasianwidth/-/eastasianwidth-0.1.1.tgz#44d656de9da415694467335365fb3147b8572b7c"
@@ -1267,6 +1271,22 @@ readable-stream@^2.2.2:
     string_decoder "~1.0.3"
     util-deprecate "~1.0.1"
 
+redis-commands@^1.2.0:
+  version "1.3.1"
+  resolved "https://registry.yarnpkg.com/redis-commands/-/redis-commands-1.3.1.tgz#81d826f45fa9c8b2011f4cd7a0fe597d241d442b"
+
+redis-parser@^2.6.0:
+  version "2.6.0"
+  resolved "https://registry.yarnpkg.com/redis-parser/-/redis-parser-2.6.0.tgz#52ed09dacac108f1a631c07e9b69941e7a19504b"
+
+redis@2.8.0:
+  version "2.8.0"
+  resolved "https://registry.yarnpkg.com/redis/-/redis-2.8.0.tgz#202288e3f58c49f6079d97af7a10e1303ae14b02"
+  dependencies:
+    double-ended-queue "^2.1.0-0"
+    redis-commands "^1.2.0"
+    redis-parser "^2.6.0"
+
 repeat-string@^1.5.2:
   version "1.6.1"
   resolved "https://registry.yarnpkg.com/repeat-string/-/repeat-string-1.6.1.tgz#8dcae470e1c88abc2d600fff4a776286da75e637"

From 2eb463225baf1bf75d6a540351fccf0fd16496e2 Mon Sep 17 00:00:00 2001
From: yujiosaka
Date: Sun, 10 Dec 2017 16:52:59 +0900
Subject: [PATCH 2/2] Add Redis image to CircleCI settings

---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index d461c3e8..9015465a 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,6 +3,7 @@ jobs:
   build:
     docker:
       - image: circleci/node:6.10
+      - image: redis
     steps:
       - checkout
       - run:
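---

Note: a minimal sketch of how a consuming script might plug a custom storage backend into the cache interface these patches introduce. The `MapCache` class, its `Map`-based storage, and the queued URLs are illustrative assumptions and not part of the patches; only `HCCrawler.BaseCache`, `BaseCache.key()`, and the `cache` launch option come from the code above.

```js
const HCCrawler = require('headless-chrome-crawler');

// Hypothetical cache backed by an in-memory Map. Any storage should work
// as long as init/clear/close/exists/set resolve promises, mirroring the
// BaseCache interface added in lib/cache/base.js.
class MapCache extends HCCrawler.BaseCache {
  init() {
    this._map = new Map();
    return Promise.resolve();
  }
  clear() {
    this._map.clear();
    return Promise.resolve();
  }
  close() {
    this._map = null;
    return Promise.resolve();
  }
  exists(options) {
    // BaseCache.key() hashes the options, omitting volatile fields
    return Promise.resolve(this._map.has(HCCrawler.BaseCache.key(options)));
  }
  set(options) {
    this._map.set(HCCrawler.BaseCache.key(options), true);
    return Promise.resolve();
  }
}

HCCrawler.launch({
  evaluatePage: (() => ({
    title: $('title').text(),
  })),
  onSuccess: (result => {
    console.log('onSuccess', result);
  }),
  cache: new MapCache(),
})
  .then(crawler => {
    crawler.queue('https://example.com/');
    crawler.queue('https://example.com/'); // Skipped: already in the cache
    crawler.onIdle()
      .then(() => crawler.close());
  });
```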