Merge pull request #50 from yujiosaka/useful_events
Better debug logs
yujiosaka authored Dec 25, 2017
2 parents 5c650a9 + 8d5d4c0 commit 934438c
Showing 6 changed files with 347 additions and 168 deletions.
71 changes: 58 additions & 13 deletions README.md
@@ -105,10 +105,17 @@ NODE_PATH=../ node examples/priority-queue.js
* [crawler.version()](#crawlerversion)
* [crawler.wsEndpoint()](#crawlerwsendpoint)
* [crawler.onIdle()](#crawleronidle)
- * [crawler.isPaused](#crawlerispaused)
- * [crawler.queueSize](#crawlerqueuesize)
- * [crawler.pendingQueueSize](#crawlerpendingqueuesize)
- * [crawler.requestedCount](#crawlerrequestedcount)
+ * [crawler.isPaused()](#crawlerispaused)
+ * [crawler.queueSize()](#crawlerqueuesize)
+ * [crawler.pendingQueueSize()](#crawlerpendingqueuesize)
+ * [crawler.requestedCount()](#crawlerrequestedcount)
+ * [event: 'requeststarted'](#event-requeststarted)
+ * [event: 'requestskipped'](#event-requestskipped)
+ * [event: 'requestfinished'](#event-requestfinished)
+ * [event: 'requestfailed'](#event-requestfailed)
+ * [event: 'maxdepthreached'](#event-maxdepthreached)
+ * [event: 'maxrequestreached'](#event-maxrequestreached)
+ * [event: 'disconnected'](#event-disconnected)
* [class: SessionCache](#class-sessioncache)
* [class: RedisCache](#class-rediscache)
* [class: BaseCache](#class-basecache)
@@ -278,23 +285,61 @@ See [Puppeteer's browser.wsEndpoint()](https://github.com/GoogleChrome/puppeteer

#### crawler.onIdle()

- - returns: <[Promise]> Promise resolved when queues become empty or paused.
+ returns: <[Promise]> Promise resolved when queues become empty or paused.

- #### crawler.isPaused
+ #### crawler.isPaused()

- * returns: <[boolean]> Whether the queue is paused. This property is read only.
+ * returns: <[boolean]> Whether the queue is paused.

- #### crawler.queueSize
+ #### crawler.queueSize()

- * returns: <[number]> The size of queues. This property is read only.
+ * returns: <[number]> The size of queues.

- #### crawler.pendingQueueSize
+ #### crawler.pendingQueueSize()

- * returns: <[number]> The size of pending queues. This property is read only.
+ * returns: <[number]> The size of pending queues.

- #### crawler.requestedCount
+ #### crawler.requestedCount()

- * returns: <[number]> The count of total requests. This property is read only.
+ * returns: <[number]> The count of total requests.

+ #### event: 'requeststarted'
+
+ * `options` <[Object]>
+
+ Emitted when a request starts.
+
+ #### event: 'requestskipped'
+
+ * `options` <[Object]>
+
+ Emitted when a request is skipped.
+
+ #### event: 'requestfinished'
+
+ * `options` <[Object]>
+
+ Emitted when a request finishes successfully.
+
+ #### event: 'requestfailed'
+
+ * `error` <[Error]>
+
+ Emitted when a request fails.
+
+ #### event: 'maxdepthreached'
+
+ * `options` <[Object]>
+
+ Emitted when a queue reaches the `maxDepth` option of [crawler.queue()](#crawlerqueueoptions).
+
+ #### event: 'maxrequestreached'
+
+ Emitted when the number of requests reaches the `maxRequest` option of [HCCrawler.connect()](#hccrawlerconnectoptions) or [HCCrawler.launch()](#hccrawlerlaunchoptions).
+
+ #### event: 'disconnected'
+
+ Emitted when the browser instance is disconnected.

### class: SessionCache

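The events above replace debug-only logging with hooks that callers can observe directly. A minimal usage sketch based on the API documented above (the package entry point and URL are illustrative, not part of this diff):

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({ maxRequest: 5 })
  .then(crawler => {
    // Lifecycle hooks replace the old debugRequest() log lines.
    crawler.on('requeststarted', options => console.log('started:', options.url));
    crawler.on('requestskipped', options => console.log('skipped:', options.url));
    crawler.on('requestfailed', error => console.error('failed:', error));
    crawler.on('maxrequestreached', () => console.warn('maxRequest reached; queue paused'));
    crawler.queue('https://example.com/');
    // onIdle() resolves when the queue is empty or paused, so this also
    // covers the self-pause triggered by maxRequest.
    return crawler.onIdle().then(() => crawler.close());
  });
```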
16 changes: 11 additions & 5 deletions lib/crawler.js
@@ -5,7 +5,12 @@ const {
noop,
} = require('lodash');
const devices = require('puppeteer/DeviceDescriptors');
- const { resolveUrl, debugRequest, debugBrowser } = require('./helper');
+ const {
+   resolveUrl,
+   debugConsole,
+   debugDialog,
+   tracePublicAPI,
+ } = require('./helper');

const GOTO_OPTIONS = [
'timeout',
@@ -122,9 +127,8 @@ class Crawler {
* @private
*/
_handlePageEvents() {
- this._page.on('load', () => void debugRequest(`Page loaded for ${this._options.url}`));
- this._page.on('pageerror', msg => void debugRequest(msg));
- this._page.on('console', msg => void debugBrowser(`Console ${msg.type} ${msg.text} for ${this._options.url}`));
+ this._page.on('pageerror', msg => void debugConsole(msg));
+ this._page.on('console', msg => void debugConsole(`${msg.type} ${msg.text} at ${this._options.url}`));
this._page.on('dialog', dialog => this._handleDialog(dialog, this._options));
}

@@ -134,7 +138,7 @@
* @private
*/
_handleDialog(dialog) {
- debugBrowser(`Dialog ${dialog.type} ${dialog.message()} for ${this._options.url}`);
+ debugDialog(`${dialog.type} ${dialog.message()} at ${this._options.url}`);
return dialog.dismiss();
}

@@ -210,4 +214,6 @@
}
}

+ tracePublicAPI(Crawler);
+
module.exports = Crawler;
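The new helpers `debugConsole`, `debugDialog`, and `tracePublicAPI` are imported above, but lib/helper.js is not among the hunks shown. A minimal sketch of what they might look like, assuming they wrap the `debug` package the same way the removed `debugRequest`/`debugBrowser` did (namespaces and the tracing approach are assumptions, not part of this commit; `resolveUrl`, `delay`, and `generateKey` are omitted):

```js
const debug = require('debug');

// Namespaced loggers; enable with e.g. DEBUG=hccrawler:* node app.js
const debugConsole = debug('hccrawler:console');
const debugDialog = debug('hccrawler:dialog');

/**
 * Wraps every public prototype method of a class so each call is logged
 * under the class's own debug namespace before being forwarded unchanged.
 */
function tracePublicAPI(classType) {
  const log = debug(`hccrawler:${classType.name.toLowerCase()}`);
  Object.getOwnPropertyNames(classType.prototype)
    .filter(name => name !== 'constructor' && !name.startsWith('_'))
    .forEach(name => {
      const descriptor = Object.getOwnPropertyDescriptor(classType.prototype, name);
      if (!descriptor || typeof descriptor.value !== 'function') return; // skip getters
      const method = descriptor.value;
      classType.prototype[name] = function (...args) {
        log(`${name} called`);
        return method.apply(this, args);
      };
    });
}

module.exports = { debugConsole, debugDialog, tracePublicAPI };
```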
72 changes: 48 additions & 24 deletions lib/hccrawler.js
@@ -1,8 +1,5 @@
- const URL = require('url');
- const PQueue = require('p-queue');
- const Puppeteer = require('puppeteer');
- const devices = require('puppeteer/DeviceDescriptors');
- const Crawler = require('./crawler');
+ const EventEmitter = require('events');
+ const { parse } = require('url');
const {
pick,
omit,
@@ -15,8 +12,16 @@
isString,
isArray,
} = require('lodash');
+ const PQueue = require('p-queue');
+ const Puppeteer = require('puppeteer');
+ const devices = require('puppeteer/DeviceDescriptors');
+ const {
+   delay,
+   generateKey,
+   tracePublicAPI,
+ } = require('./helper');
+ const Crawler = require('./crawler');
const SessionCache = require('../cache/session');
- const { delay, generateKey, debugRequest } = require('./helper');

const PUPPETEER_CONNECT_OPTIONS = [
'browserWSEndpoint',
@@ -52,7 +57,7 @@ const RESPONSE_FIELDS = [

const deviceNames = Object.keys(devices);

- class HCCrawler {
+ class HCCrawler extends EventEmitter {
/**
* @param {Object=} options
* @return {Promise}
@@ -85,6 +90,7 @@ class HCCrawler {
* @param {!Object} options
*/
constructor(browser, options) {
+ super();
this._browser = browser;
this._options = extend({
maxDepth: 1,
@@ -102,6 +108,7 @@
this._pQueue = new PQueue({ concurrency: this._options.maxConcurrency });
this._requestedCount = 0;
this._exportHeader();
+ this._browser.on('disconnected', () => void this.emit(HCCrawler.Events.Disconnected));
}

/**
@@ -197,28 +204,28 @@
/**
* @return {boolean}
*/
- get isPaused() {
- return this._pQueue.isPaused();
+ isPaused() {
+ return this._pQueue.isPaused;
}

/**
* @return {number}
*/
- get queueSize() {
+ queueSize() {
return this._pQueue.size;
}

/**
* @return {number}
*/
- get pendingQueueSize() {
+ pendingQueueSize() {
return this._pQueue.pending;
}

/**
* @return {number}
*/
- get requestedCount() {
+ requestedCount() {
return this._requestedCount;
}

@@ -230,9 +237,9 @@
* @private
*/
_request(options, depth = 1, retryCount = 0) {
- if (retryCount === 0) debugRequest(`Start requesting ${options.url}`);
+ if (retryCount === 0) this.emit(HCCrawler.Events.RequestStarted, options);
if (!this._checkAllowedDomains(options)) {
- debugRequest(`Skip requesting ${options.url}`);
+ this.emit(HCCrawler.Events.RequestSkipped, options);
return Promise.resolve();
}
return Promise.all([
@@ -241,37 +248,38 @@
])
.then(([exists, shouldRequest]) => {
if (exists || !shouldRequest) {
- debugRequest(`Skip requesting ${options.url}`);
+ this.emit(HCCrawler.Events.RequestSkipped, options);
return Promise.resolve();
}
return this._newPage(options)
.then(crawler => (
crawler.crawl()
.then(res => {
+ this.emit(HCCrawler.Events.RequestFinished, options);
res.response = pick(res.response, RESPONSE_FIELDS);
res.options = options;
const onSuccess = options.onSuccess || noop;
- Promise.resolve(onSuccess(res))
- .then(() => void debugRequest(`End requesting ${options.url}`))
+ return Promise.resolve(onSuccess(res))
.then(() => void this._exportLine(res))
.then(() => void this._followLinks(res.links, options, depth))
.then(() => void this._checkRequestCount())
.then(() => crawler.close())
.then(() => delay(options.delay));
})
.catch(error => {
+ this.emit(HCCrawler.Events.RequestFailed, error);
if (retryCount >= options.retryCount) throw error;
- debugRequest(error.message);
return crawler.close()
.then(() => delay(options.retryDelay))
.then(() => this._removeExists(options))
.then(() => this._request(options, depth, retryCount + 1));
})
.catch(error => {
- debugRequest(`Retry give-up for requesting ${options.url} after ${retryCount} tries`);
const onError = options.onError || noop;
- return crawler.close()
- .then(() => Promise.resolve(onError(error)));
+ return Promise.resolve(onError(error))
+ .then(() => this._checkRequestCount())
+ .then(() => crawler.close())
+ .then(() => delay(options.delay));
})
));
});
Expand All @@ -283,7 +291,7 @@ class HCCrawler {
* @private
*/
_checkAllowedDomains(options) {
- const { hostname } = URL.parse(options.url);
+ const { hostname } = parse(options.url);
if (!options.allowedDomains) return true;
return some(options.allowedDomains, domain => endsWith(hostname, domain));
}
@@ -338,10 +346,13 @@
* @private
*/
_followLinks(links, options, depth) {
- if (depth >= options.maxDepth) return;
+ if (depth >= options.maxDepth) {
+ this.emit(HCCrawler.Events.MaxDepthReached);
+ return;
+ }
each(links, link => {
const _options = extend({}, options, { url: link });
- this._pQueue.add(() => this._request(_options, depth), {
+ this._pQueue.add(() => this._request(_options, depth + 1), {
priority: _options.priority,
});
});
@@ -353,6 +364,7 @@
_checkRequestCount() {
this._requestedCount += 1;
if (this._options.maxRequest && this._requestedCount >= this._options.maxRequest) {
+ this.emit(HCCrawler.Events.MaxRequestReached);
this.pause();
}
}
@@ -408,4 +420,16 @@
}
}

+ HCCrawler.Events = {
+   RequestStarted: 'requeststarted',
+   RequestSkipped: 'requestskipped',
+   RequestFinished: 'requestfinished',
+   RequestFailed: 'requestfailed',
+   MaxDepthReached: 'maxdepthreached',
+   MaxRequestReached: 'maxrequestreached',
+   Disconnected: 'disconnected',
+ };
+
+ tracePublicAPI(HCCrawler);
+
module.exports = HCCrawler;
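Publishing the event names as the `HCCrawler.Events` map lets callers subscribe via constants instead of string literals. A short sketch of reacting to the self-pause performed in `_checkRequestCount()` (entry point, URL, and threshold are illustrative):

```js
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({ maxRequest: 100 })
  .then(crawler => {
    // MaxRequestReached is emitted just before pause(), so the queue is
    // still inspectable here via the new accessor methods.
    crawler.on(HCCrawler.Events.MaxRequestReached, () => {
      console.log(`Paused with ${crawler.queueSize()} URLs still queued`);
      return crawler.close();
    });
    crawler.queue('https://example.com/');
  });
```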