From 8d5d4c051960b7a334a4766358037a21b60bc6c6 Mon Sep 17 00:00:00 2001
From: yujiosaka
Date: Mon, 25 Dec 2017 10:44:31 +0900
Subject: [PATCH] Better debug logs

---
 README.md              |  71 ++++++++++---
 lib/crawler.js         |  16 ++-
 lib/hccrawler.js       |  72 ++++++++-----
 lib/helper.js          |  68 +++++++++++--
 test/hccrawler.test.js | 224 +++++++++++++++++++++--------------------
 test/helper.test.js    |  64 ++++++++++--
 6 files changed, 347 insertions(+), 168 deletions(-)

diff --git a/README.md b/README.md
index fd78606d..0fb77d49 100644
--- a/README.md
+++ b/README.md
@@ -105,10 +105,17 @@ NODE_PATH=../ node examples/priority-queue.js
   * [crawler.version()](#crawlerversion)
   * [crawler.wsEndpoint()](#crawlerwsendpoint)
   * [crawler.onIdle()](#crawleronidle)
-  * [crawler.isPaused](#crawlerispaused)
-  * [crawler.queueSize](#crawlerqueuesize)
-  * [crawler.pendingQueueSize](#crawlerpendingqueuesize)
-  * [crawler.requestedCount](#crawlerrequestedcount)
+  * [crawler.isPaused()](#crawlerispaused)
+  * [crawler.queueSize()](#crawlerqueuesize)
+  * [crawler.pendingQueueSize()](#crawlerpendingqueuesize)
+  * [crawler.requestedCount()](#crawlerrequestedcount)
+  * [event: 'requeststarted'](#event-requeststarted)
+  * [event: 'requestskipped'](#event-requestskipped)
+  * [event: 'requestfinished'](#event-requestfinished)
+  * [event: 'requestfailed'](#event-requestfailed)
+  * [event: 'maxdepthreached'](#event-maxdepthreached)
+  * [event: 'maxrequestreached'](#event-maxrequestreached)
+  * [event: 'disconnected'](#event-disconnected)
 * [class: SessionCache](#class-sessioncache)
 * [class: RedisCache](#class-rediscache)
 * [class: BaseCache](#class-basecache)
@@ -278,23 +285,61 @@ See [Puppeteer's browser.wsEndpoint()](https://github.com/GoogleChrome/puppeteer
 
 #### crawler.onIdle()
 
-- returns: <[Promise]> Promise resolved when queues become empty or paused.
+* returns: <[Promise]> Promise resolved when queues become empty or paused.
 
-#### crawler.isPaused
+#### crawler.isPaused()
 
-* returns: <[boolean]> Whether the queue is paused. This property is read only.
+* returns: <[boolean]> Whether the queue is paused.
 
-#### crawler.queueSize
+#### crawler.queueSize()
 
-* returns: <[number]> The size of queues. This property is read only.
+* returns: <[number]> The size of queues.
 
-#### crawler.pendingQueueSize
+#### crawler.pendingQueueSize()
 
-* returns: <[number]> The size of pending queues. This property is read only.
+* returns: <[number]> The size of pending queues.
 
-#### crawler.requestedCount
+#### crawler.requestedCount()
 
-* returns: <[number]> The count of total requests. This property is read only.
+* returns: <[number]> The count of total requests.
+
+#### event: 'requeststarted'
+
+* `options` <[Object]>
+
+Emitted when a request starts.
+
+#### event: 'requestskipped'
+
+* `options` <[Object]>
+
+Emitted when a request is skipped.
+
+#### event: 'requestfinished'
+
+* `options` <[Object]>
+
+Emitted when a request finishes successfully.
+
+#### event: 'requestfailed'
+
+* `error` <[Error]>
+
+Emitted when a request fails.
+
+#### event: 'maxdepthreached'
+
+* `options` <[Object]>
+
+Emitted when a request reaches the `maxDepth` option passed to [crawler.queue()](#crawlerqueueoptions).
+
+#### event: 'maxrequestreached'
+
+Emitted when the total number of requests reaches the `maxRequest` option passed to [HCCrawler.connect()](#hccrawlerconnectoptions) or [HCCrawler.launch()](#hccrawlerlaunchoptions).
+
+#### event: 'disconnected'
+
+Emitted when the browser instance is disconnected.
 
 ### class: SessionCache
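The seven events documented above map one-to-one onto the `HCCrawler.Events` map added in lib/hccrawler.js. A minimal usage sketch of the new event API, assuming the package is installed as `headless-chrome-crawler`; the URL and the handler bodies are illustrative only:

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({ maxConcurrency: 1 })
  .then(crawler => {
    // Every event except 'requestfailed' receives the request's options object;
    // 'requestfailed' receives the Error of the failed attempt.
    crawler.on('requeststarted', options => console.log('started:', options.url));
    crawler.on('requestskipped', options => console.log('skipped:', options.url));
    crawler.on('requestfailed', error => console.error('failed:', error.message));
    crawler.queue('https://example.com/'); // illustrative URL
    return crawler.onIdle().then(() => crawler.close());
  });
```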
diff --git a/lib/crawler.js b/lib/crawler.js
index 310a3bd1..21fa1ec3 100644
--- a/lib/crawler.js
+++ b/lib/crawler.js
@@ -5,7 +5,12 @@ const {
   noop,
 } = require('lodash');
 const devices = require('puppeteer/DeviceDescriptors');
-const { resolveUrl, debugRequest, debugBrowser } = require('./helper');
+const {
+  resolveUrl,
+  debugConsole,
+  debugDialog,
+  tracePublicAPI,
+} = require('./helper');
 
 const GOTO_OPTIONS = [
   'timeout',
@@ -122,9 +127,8 @@ class Crawler {
    * @private
    */
   _handlePageEvents() {
-    this._page.on('load', () => void debugRequest(`Page loaded for ${this._options.url}`));
-    this._page.on('pageerror', msg => void debugRequest(msg));
-    this._page.on('console', msg => void debugConsole(`Console ${msg.type} ${msg.text} for ${this._options.url}`));
+    this._page.on('pageerror', msg => void debugConsole(msg));
+    this._page.on('console', msg => void debugConsole(`${msg.type} ${msg.text} at ${this._options.url}`));
     this._page.on('dialog', dialog => this._handleDialog(dialog, this._options));
   }
 
@@ -134,7 +138,7 @@ class Crawler {
    * @private
    */
   _handleDialog(dialog) {
-    debugBrowser(`Dialog ${dialog.type} ${dialog.message()} for ${this._options.url}`);
+    debugDialog(`${dialog.type} ${dialog.message()} at ${this._options.url}`);
     return dialog.dismiss();
   }
 
@@ -210,4 +214,6 @@ class Crawler {
   }
 }
 
+tracePublicAPI(Crawler);
+
 module.exports = Crawler;
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index 0adb8b86..a860b0c3 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -1,8 +1,5 @@
-const URL = require('url');
-const PQueue = require('p-queue');
-const Puppeteer = require('puppeteer');
-const devices = require('puppeteer/DeviceDescriptors');
-const Crawler = require('./crawler');
+const EventEmitter = require('events');
+const { parse } = require('url');
 const {
   pick,
   omit,
@@ -15,8 +12,16 @@ const {
   isString,
   isArray,
 } = require('lodash');
+const PQueue = require('p-queue');
+const Puppeteer = require('puppeteer');
+const devices = require('puppeteer/DeviceDescriptors');
+const {
+  delay,
+  generateKey,
+  tracePublicAPI,
+} = require('./helper');
+const Crawler = require('./crawler');
 const SessionCache = require('../cache/session');
-const { delay, generateKey, debugRequest } = require('./helper');
 
 const PUPPETEER_CONNECT_OPTIONS = [
   'browserWSEndpoint',
@@ -52,7 +57,7 @@ const RESPONSE_FIELDS = [
 
 const deviceNames = Object.keys(devices);
 
-class HCCrawler {
+class HCCrawler extends EventEmitter {
   /**
    * @param {Object=} options
    * @return {Promise}
@@ -85,6 +90,7 @@ class HCCrawler {
    * @param {!Object} options
    */
   constructor(browser, options) {
+    super();
     this._browser = browser;
     this._options = extend({
       maxDepth: 1,
@@ -102,6 +108,7 @@ class HCCrawler {
     this._pQueue = new PQueue({ concurrency: this._options.maxConcurrency });
     this._requestedCount = 0;
     this._exportHeader();
+    this._browser.on('disconnected', () => void this.emit(HCCrawler.Events.Disconnected));
   }
 
   /**
@@ -197,28 +204,28 @@ class HCCrawler {
   /**
-   * @return {bolean}
+   * @return {boolean}
    */
-  get isPaused() {
-    return this._pQueue.isPaused();
+  isPaused() {
+    return this._pQueue.isPaused;
   }
 
   /**
    * @return {number}
    */
-  get queueSize() {
+  queueSize() {
     return this._pQueue.size;
   }
 
   /**
    * @return {number}
    */
-  get pendingQueueSize() {
+  pendingQueueSize() {
     return this._pQueue.pending;
   }
 
   /**
    * @return {number}
    */
-  get requestedCount() {
+  requestedCount() {
     return this._requestedCount;
  }
 
@@ -230,9 +237,9 @@ class HCCrawler {
    * @private
    */
   _request(options, depth = 1, retryCount = 0) {
-    if (retryCount === 0) debugRequest(`Start requesting ${options.url}`);
+    if (retryCount === 0) this.emit(HCCrawler.Events.RequestStarted, options);
     if (!this._checkAllowedDomains(options)) {
-      debugRequest(`Skip requesting ${options.url}`);
+      this.emit(HCCrawler.Events.RequestSkipped, options);
       return Promise.resolve();
     }
     return Promise.all([
@@ -241,18 +248,18 @@ class HCCrawler {
     ])
       .then(([exists, shouldRequest]) => {
         if (exists || !shouldRequest) {
-          debugRequest(`Skip requesting ${options.url}`);
+          this.emit(HCCrawler.Events.RequestSkipped, options);
           return Promise.resolve();
         }
         return this._newPage(options)
           .then(crawler => (
            crawler.crawl()
              .then(res => {
+                this.emit(HCCrawler.Events.RequestFinished, options);
                res.response = pick(res.response, RESPONSE_FIELDS);
                res.options = options;
                const onSuccess = options.onSuccess || noop;
-                Promise.resolve(onSuccess(res))
-                  .then(() => void debugRequest(`End requesting ${options.url}`))
+                return Promise.resolve(onSuccess(res))
                  .then(() => void this._exportLine(res))
                  .then(() => void this._followLinks(res.links, options, depth))
                  .then(() => void this._checkRequestCount())
                  .then(() => crawler.close())
                  .then(() => delay(options.delay));
              })
              .catch(error => {
+                this.emit(HCCrawler.Events.RequestFailed, error);
                if (retryCount >= options.retryCount) throw error;
-                debugRequest(error.message);
                return crawler.close()
                  .then(() => delay(options.retryDelay))
                  .then(() => this._removeExists(options))
                  .then(() => this._request(options, depth, retryCount + 1));
              })
              .catch(error => {
-                debugRequest(`Retry give-up for requesting ${options.url} after ${retryCount} tries`);
                const onError = options.onError || noop;
-                return crawler.close()
-                  .then(() => Promise.resolve(onError(error)));
+                return Promise.resolve(onError(error))
+                  .then(() => this._checkRequestCount())
+                  .then(() => crawler.close())
+                  .then(() => delay(options.delay));
              })
          ));
      });
@@ -283,7 +291,7 @@ class HCCrawler {
    * @private
    */
   _checkAllowedDomains(options) {
-    const { hostname } = URL.parse(options.url);
+    const { hostname } = parse(options.url);
     if (!options.allowedDomains) return true;
     return some(options.allowedDomains, domain => endsWith(hostname, domain));
   }
@@ -338,10 +346,13 @@ class HCCrawler {
    * @private
    */
   _followLinks(links, options, depth) {
-    if (depth >= options.maxDepth) return;
+    if (depth >= options.maxDepth) {
+      this.emit(HCCrawler.Events.MaxDepthReached, options);
+      return;
+    }
     each(links, link => {
       const _options = extend({}, options, { url: link });
-      this._pQueue.add(() => this._request(_options, depth), {
+      this._pQueue.add(() => this._request(_options, depth + 1), {
         priority: _options.priority,
       });
     });
@@ -353,6 +364,7 @@ class HCCrawler {
   _checkRequestCount() {
     this._requestedCount += 1;
     if (this._options.maxRequest && this._requestedCount >= this._options.maxRequest) {
+      this.emit(HCCrawler.Events.MaxRequestReached);
      this.pause();
    }
  }
@@ -408,4 +420,16 @@ class HCCrawler {
   }
 }
 
+HCCrawler.Events = {
+  RequestStarted: 'requeststarted',
+  RequestSkipped: 'requestskipped',
+  RequestFinished: 'requestfinished',
+  RequestFailed: 'requestfailed',
+  MaxDepthReached: 'maxdepthreached',
+  MaxRequestReached: 'maxrequestreached',
+  Disconnected: 'disconnected',
+};
+
+tracePublicAPI(HCCrawler);
+
 module.exports = HCCrawler;
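Two behavioral notes on the retry path above: 'requestfailed' fires once per failed attempt, including attempts that will be retried, while `onError` runs only after `retryCount` retries are exhausted. A hedged sketch of how a caller might combine the two; the URL and handler bodies are illustrative:

```js
// Fires on every failed attempt, retried or not.
crawler.on('requestfailed', error => console.warn('attempt failed:', error.message));

crawler.queue({
  url: 'https://example.com/flaky', // illustrative URL
  retryCount: 3,   // up to 3 retries, i.e. 4 attempts in total
  retryDelay: 500, // wait 500 ms between attempts
  onError: error => console.error('gave up:', error.message), // runs once, after the last attempt
});
```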
diff --git a/lib/helper.js b/lib/helper.js
index 0ca07c1c..75c6de79 100644
--- a/lib/helper.js
+++ b/lib/helper.js
@@ -1,4 +1,5 @@
-const URL = require('url');
+const { inspect } = require('util');
+const { parse, resolve } = require('url');
 const crypto = require('crypto');
 const {
   omit,
@@ -6,9 +7,13 @@ const {
   trim,
   startsWith,
   includes,
+  isString,
+  isFunction,
 } = require('lodash');
-const debugRequest = require('debug')('hccrawler:request');
-const debugBrowser = require('debug')('hccrawler:browser');
+const debug = require('debug');
+
+const debugConsole = debug('hccrawler:console');
+const debugDialog = debug('hccrawler:dialog');
 
 const OMITTED_HASH_FIELDS = [
   'priority',
@@ -35,7 +40,7 @@ class Helper {
    * @return {Promise}
    */
   static delay(milliseconds) {
-    return new Promise(resolve => setTimeout(resolve, milliseconds));
+    return new Promise(_resolve => setTimeout(_resolve, milliseconds));
   }
 
   /**
@@ -79,15 +84,20 @@ class Helper {
     url = trim(url);
     if (!url) return null;
     if (startsWith(url, '#')) return null;
-    const { protocol } = URL.parse(url);
+    const { protocol } = parse(url);
     if (includes(['http:', 'https:'], protocol)) {
       return url.split('#')[0];
     } else if (!protocol) {
-      return URL.resolve(baseUrl, url).split('#')[0];
+      return resolve(baseUrl, url).split('#')[0];
     }
     return null;
   }
 
+  /**
+   * @param {!string} value
+   * @param {string=} separator
+   * @return {string}
+   */
   static escapeQuotes(value, separator = ',') {
     if (value === null || value === undefined) return '';
     const regExp = new RegExp(`["${separator}\\r\\n]`);
@@ -95,18 +105,56 @@ class Helper {
     return value;
   }
 
+  /**
+   * @param {!Object} classType
+   */
+  static tracePublicAPI(classType) {
+    const className = classType.prototype.constructor.name.toLowerCase();
+    const debugClass = debug(`hccrawler:${className}`);
+    Reflect.ownKeys(classType.prototype).forEach(methodName => {
+      if (methodName === 'constructor' || !isString(methodName) || startsWith(methodName, '_')) return;
+      const method = Reflect.get(classType.prototype, methodName);
+      if (!isFunction(method)) return;
+      Reflect.set(classType.prototype, methodName, function (...args) {
+        const argsText = args.map(Helper.stringifyArgument).join(', ');
+        debugClass(`${methodName}(${argsText})`);
+        return method.call(this, ...args);
+      });
+    });
+    if (classType.Events) {
+      const method = Reflect.get(classType.prototype, 'emit');
+      Reflect.set(classType.prototype, 'emit', function (event, ...args) {
+        const argsText = [JSON.stringify(event)].concat(args.map(Helper.stringifyArgument)).join(', ');
+        debugClass(`emit(${argsText})`);
+        return method.call(this, event, ...args);
+      });
+    }
+  }
+
+  /**
+   * @param {!Object} arg
+   * @return {string}
+   */
+  static stringifyArgument(arg) {
+    return inspect(arg)
+      .split('\n')
+      .map(line => trim(line))
+      .join(' ');
+  }
+
   /**
    * @param {!string} msg
    */
-  static debugRequest(msg) {
-    debugRequest(msg);
+  static debugConsole(msg) {
+    debugConsole(msg);
   }
 
   /**
    * @param {!string} msg
    */
-  static debugBrowser(msg) {
-    debugBrowser(msg);
+  static debugDialog(msg) {
+    debugDialog(msg);
   }
 }
diff --git a/test/hccrawler.test.js b/test/hccrawler.test.js
index 350a0197..d9dcdd3c 100644
--- a/test/hccrawler.test.js
+++ b/test/hccrawler.test.js
@@ -54,12 +54,10 @@ describe('HCCrawler', () => {
     });
 
     it('crawls when queueing necessary options', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1 });
-      });
+      crawler.queue({ url: URL1 });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
   });
@@ -75,53 +73,43 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('crawls when queueing a string', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-      });
+      crawler.queue(URL1);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
 
     it('crawls when queueing multiple strings', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue([URL1, URL2, URL3]);
-      });
+      crawler.queue([URL1, URL2, URL3]);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 3);
+          assert.equal(crawler.requestedCount(), 3);
         });
     });
 
     it('crawls when queueing an object', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1 });
-      });
+      crawler.queue({ url: URL1 });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
 
     it('crawls when queueing multiple objects', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue([{ url: URL1 }, { url: URL2 }, { url: URL3 }]);
-      });
+      crawler.queue([{ url: URL1 }, { url: URL2 }, { url: URL3 }]);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 3);
+          assert.equal(crawler.requestedCount(), 3);
         });
     });
 
     it('crawls when queueing mixed styles', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue([URL1, { url: URL2 }]);
-        crawler.queue(URL3);
-      });
+      crawler.queue([URL1, { url: URL2 }]);
+      crawler.queue(URL3);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 3);
+          assert.equal(crawler.requestedCount(), 3);
         });
     });
 
@@ -141,12 +129,13 @@ describe('HCCrawler', () => {
       function preRequest() {
         return Promise.resolve(true);
       }
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, preRequest });
-      });
+      let requestskipped = false;
+      crawler.on('requestskipped', () => { requestskipped = true; });
+      crawler.queue({ url: URL1, preRequest });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(requestskipped, false);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
 
@@ -154,12 +143,13 @@ describe('HCCrawler', () => {
       function preRequest() {
         return Promise.resolve(false);
       }
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, preRequest });
-      });
+      let requestskipped = false;
+      crawler.on('requestskipped', () => { requestskipped = true; });
+      crawler.queue({ url: URL1, preRequest });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 0);
+          assert.equal(requestskipped, true);
+          assert.equal(crawler.requestedCount(), 0);
         });
     });
 
@@ -169,9 +159,7 @@ describe('HCCrawler', () => {
       function preRequest(options) {
        options.screenshot = { path };
        return Promise.resolve(true);
      }
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, preRequest });
-      });
+      crawler.queue({ url: URL1, preRequest });
       return crawler.onIdle()
         .then(() => {
           const { screenshot } = Crawler.prototype.crawl.firstCall.thisValue._options;
           assert.equal(screenshot.path, path);
         });
     });
 
     it('crawls when the requested domain is allowed', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, allowedDomains: ['example.com', 'example.net'] });
-      });
+      let requestskipped = false;
+      crawler.on('requestskipped', () => { requestskipped = true; });
+      crawler.queue({ url: URL1, allowedDomains: ['example.com', 'example.net'] });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(requestskipped, false);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
 
     it('skips crawling when the requested domain is not allowed', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, allowedDomains: ['example.net', 'example.org'] });
-      });
+      let requestskipped = false;
+      crawler.on('requestskipped', () => { requestskipped = true; });
+      crawler.queue({ url: URL1, allowedDomains: ['example.net', 'example.org'] });
+      return crawler.onIdle()
+        .then(() => {
+          assert.equal(requestskipped, true);
+          assert.equal(crawler.requestedCount(), 0);
+        });
+    });
+
+    it('emits request events', () => {
+      let requeststarted = false;
+      let requestfinished = false;
+      crawler.on('requeststarted', () => { requeststarted = true; });
+      crawler.on('requestfinished', () => { requestfinished = true; });
+      crawler.queue(URL1);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 0);
+          assert.equal(requeststarted, true);
+          assert.equal(requestfinished, true);
         });
     });
   });
@@ -211,12 +214,10 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('overrides device by queueing options', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, device: 'Nexus 6' });
-      });
+      crawler.queue({ url: URL1, device: 'Nexus 6' });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(crawler.requestedCount(), 1);
           assert.equal(Crawler.prototype.crawl.firstCall.thisValue._options.device, 'Nexus 6');
         });
     });
@@ -233,12 +234,13 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('automatically follows links', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-      });
+      let maxdepthreached = false;
+      crawler.on('maxdepthreached', () => { maxdepthreached = true; });
+      crawler.queue(URL1);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 2);
+          assert.equal(maxdepthreached, true);
+          assert.equal(crawler.requestedCount(), 2);
         });
     });
   });
@@ -254,14 +256,12 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('obeys priority order', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1 });
-        crawler.queue({ url: URL2, priority: 1 });
-        crawler.queue({ url: URL3, priority: 2 });
-      });
+      crawler.queue({ url: URL1 });
+      crawler.queue({ url: URL2, priority: 1 });
+      crawler.queue({ url: URL3, priority: 2 });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 3);
+          assert.equal(crawler.requestedCount(), 3);
           assert.equal(Crawler.prototype.crawl.firstCall.thisValue._options.url, URL1);
           assert.equal(Crawler.prototype.crawl.secondCall.thisValue._options.url, URL3);
           assert.equal(Crawler.prototype.crawl.thirdCall.thisValue._options.url, URL2);
         });
     });
 
     it('does not throw an error when delay option is set', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, delay: 100 });
-      });
+      crawler.queue({ url: URL1, delay: 100 });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
   });
@@ -290,36 +288,42 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('pauses at maxRequest option', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-        crawler.queue(URL2);
-        crawler.queue(URL3);
-      });
+      let maxrequestreached = false;
+      crawler.on('maxrequestreached', () => { maxrequestreached = true; });
+      crawler.queue(URL1);
+      crawler.queue(URL2);
+      crawler.queue(URL3);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 2);
+          assert.equal(maxrequestreached, true);
+          assert.equal(crawler.isPaused(), true);
+          assert.equal(crawler.queueSize(), 1);
+          assert.equal(crawler.requestedCount(), 2);
         });
     });
 
     it('resumes from maxRequest option', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-        crawler.queue(URL2);
-        crawler.queue(URL3);
-      });
+      crawler.queue(URL1);
+      crawler.queue(URL2);
+      crawler.queue(URL3);
       return crawler.onIdle()
         .then(() => {
-          crawler.setMaxRequest(3);
+          assert.equal(crawler.isPaused(), true);
+          assert.equal(crawler.queueSize(), 1);
+          crawler.setMaxRequest(4);
           crawler.resume();
+          assert.equal(crawler.isPaused(), false);
           return crawler.onIdle();
         })
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 3);
+          assert.equal(crawler.isPaused(), false);
+          assert.equal(crawler.queueSize(), 0);
+          assert.equal(crawler.requestedCount(), 3);
         });
     });
   });
 
-  context('when the crawler is launched with crawler option', () => {
+  context('when the crawler is launched with exporter option', () => {
     function removeTemporaryFile(file) {
       return new Promise(resolve => {
         unlink(file, (() => void resolve()));
@@ -356,10 +360,8 @@ describe('HCCrawler', () => {
     ));
 
     it('exports a CSV file', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-        crawler.queue(URL2);
-      });
+      crawler.queue(URL1);
+      crawler.queue(URL2);
       return crawler.onIdle()
         .then(() => readTemporaryFile(CSV_FILE))
         .then(actual => {
@@ -388,10 +390,8 @@ describe('HCCrawler', () => {
     ));
 
     it('exports a json-line file', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-        crawler.queue(URL2);
-      });
+      crawler.queue(URL1);
+      crawler.queue(URL2);
       return crawler.onIdle()
         .then(() => readTemporaryFile(JSON_FILE))
         .then(actual => {
@@ -415,14 +415,12 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('does not crawl already cached url', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-        crawler.queue(URL2);
-        crawler.queue(URL1); // The queue won't be requested
-      });
+      crawler.queue(URL1);
+      crawler.queue(URL2);
+      crawler.queue(URL1); // The queue won't be requested
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 2);
+          assert.equal(crawler.requestedCount(), 2);
         });
     });
   });
@@ -440,13 +438,11 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('crawls all queued urls', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-        crawler.queue(URL2);
-      });
+      crawler.queue(URL1);
+      crawler.queue(URL2);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 2);
+          assert.equal(crawler.requestedCount(), 2);
         });
     });
   });
@@ -462,13 +458,11 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('does not crawl already cached url', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL2);
-        crawler.queue(URL3);
-      });
+      crawler.queue(URL2);
+      crawler.queue(URL3);
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 1);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
   });
@@ -485,17 +479,26 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('crawls duplicate urls', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue(URL1);
-        crawler.queue(URL2);
-        crawler.queue(URL1); // The queue will be requested
-      });
+      crawler.queue(URL1);
+      crawler.queue(URL2);
+      crawler.queue(URL1); // The queue will be requested
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 3);
+          assert.equal(crawler.requestedCount(), 3);
         });
     });
   });
+
+  it('emits disconnect event', () => {
+    let disconnected = false;
+    return HCCrawler.launch()
+      .then(_crawler => {
+        crawler = _crawler;
+      })
+      .then(() => void crawler.on('disconnected', () => { disconnected = true; }))
+      .then(() => crawler.close())
+      .then(() => void assert.equal(disconnected, true));
+  });
 });
 
 context('when crawling fails', () => {
@@ -511,12 +514,13 @@ describe('HCCrawler', () => {
     afterEach(() => crawler.close());
 
     it('retries and gives up', () => {
-      assert.doesNotThrow(() => {
-        crawler.queue({ url: URL1, retryCount: 3, retryDelay: 100 });
-      });
+      let requestfailed = false;
+      crawler.on('requestfailed', () => { requestfailed = true; });
+      crawler.queue({ url: URL1, retryCount: 3, retryDelay: 100 });
       return crawler.onIdle()
         .then(() => {
-          assert.equal(Crawler.prototype.crawl.callCount, 4);
+          assert.equal(requestfailed, true);
+          assert.equal(crawler.requestedCount(), 1);
         });
     });
   });
diff --git a/test/helper.test.js b/test/helper.test.js
index 16965582..5f6101b6 100644
--- a/test/helper.test.js
+++ b/test/helper.test.js
@@ -1,4 +1,5 @@
 const assert = require('assert');
+const { noop } = require('lodash');
 const {
   delay,
   jsonStableReplacer,
@@ -6,8 +7,9 @@ const {
   generateKey,
   resolveUrl,
   escapeQuotes,
-  debugRequest,
-  debugBrowser,
+  stringifyArgument,
+  debugConsole,
+  debugDialog,
 } = require('../lib/helper');
 
 describe('Helper', () => {
@@ -147,18 +149,68 @@ describe('Helper', () => {
     });
   });
 
-  describe('Helper.debugRequest', () => {
+  describe('Helper.stringifyArgument', () => {
+    it('stringifies undefined', () => {
+      const actual = stringifyArgument(undefined);
+      const expected = 'undefined';
+      assert.equal(actual, expected);
+    });
+
+    it('stringifies null', () => {
+      const actual = stringifyArgument(null);
+      const expected = 'null';
+      assert.equal(actual, expected);
+    });
+
+    it('stringifies boolean', () => {
+      const actual = stringifyArgument(false);
+      const expected = 'false';
+      assert.equal(actual, expected);
+    });
+
+    it('stringifies string', () => {
+      const actual = stringifyArgument('https://github.com/yujiosaka/headless-chrome-crawler');
+      const expected = "'https://github.com/yujiosaka/headless-chrome-crawler'";
+      assert.equal(actual, expected);
+    });
+
+    it('stringifies number', () => {
+      const actual = stringifyArgument(3);
+      const expected = '3';
+      assert.equal(actual, expected);
+    });
+
+    it('stringifies function', () => {
+      const actual = stringifyArgument(noop);
+      const expected = '[Function: noop]';
+      assert.equal(actual, expected);
+    });
+
+    it('stringifies object', () => {
+      const actual = stringifyArgument({
+        jQuery: false,
+        url: 'https://github.com/yujiosaka/headless-chrome-crawler',
+        retryCount: 3,
+        evaluatePage: noop,
+        cache: null,
+      });
+      const expected = "{ jQuery: false, url: 'https://github.com/yujiosaka/headless-chrome-crawler', retryCount: 3, evaluatePage: [Function: noop], cache: null }";
+      assert.equal(actual, expected);
+    });
+  });
+
+  describe('Helper.debugConsole', () => {
     it('does not throw an error', () => {
       assert.doesNotThrow(() => {
-        debugRequest('Start requesting https://github.com/yujiosaka/headless-chrome-crawler');
+        debugConsole('log init at https://github.com/yujiosaka/headless-chrome-crawler');
       });
     });
   });
 
-  describe('Helper.debugBrowser', () => {
+  describe('Helper.debugDialog', () => {
     it('does not throw an error', () => {
       assert.doesNotThrow(() => {
-        debugBrowser('Console log init https://github.com/yujiosaka/headless-chrome-crawler');
+        debugDialog('beforeUnload This page is asking you to confirm that you want to leave - data you have entered may not be saved. at https://github.com/yujiosaka/headless-chrome-crawler');
       });
     });
   });
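As a closing sanity check on the tracing added in lib/helper.js: `tracePublicAPI` wraps every public prototype method (and `emit`, when the class defines an `Events` map) in a `debug` logger named `hccrawler:<classname>`. A minimal sketch of the mechanism; the `Queue` class is hypothetical and exists only for illustration, and the require path assumes the repository root:

```js
const { tracePublicAPI } = require('./lib/helper');

class Queue {
  push(item) { return item; } // public method: wrapped and traced
  _drain() {}                 // underscore-prefixed: skipped by the helper
}

tracePublicAPI(Queue);

// Run with DEBUG=hccrawler:* to see output like:
//   hccrawler:queue push('job')
new Queue().push('job');
```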