From 96d13f2195a571678f576567f4d7144ee054dfb6 Mon Sep 17 00:00:00 2001
From: yujiosaka
Date: Sun, 10 Jun 2018 19:46:30 +0900
Subject: [PATCH] feat(hccrawler): replace newpage by custom crawl

---
 docs/API.md                  |  7 -------
 docs/CHANGELOG.md            | 17 ++++++++++------
 examples/custom-crawl.js     | 29 +++++++++++++++++++++++++++
 lib/crawler.js               | 15 ++++++++++----
 lib/hccrawler.js             | 28 +++++++++++++++++---------
 test/hccrawler/index.test.js | 39 ++++++++++++++++++++++++------------
 6 files changed, 96 insertions(+), 39 deletions(-)
 create mode 100644 examples/custom-crawl.js

diff --git a/docs/API.md b/docs/API.md
index d71953d2..2e1a51e8 100644
--- a/docs/API.md
+++ b/docs/API.md
@@ -22,7 +22,6 @@
   * [crawler.queueSize()](#crawlerqueuesize)
   * [crawler.pendingQueueSize()](#crawlerpendingqueuesize)
   * [crawler.requestedCount()](#crawlerrequestedcount)
-  * [event: 'newpage'](#event-newpage)
   * [event: 'requestdisallowed'](#event-requestdisallowed)
   * [event: 'requeststarted'](#event-requeststarted)
   * [event: 'requestskipped'](#event-requestskipped)
@@ -280,12 +279,6 @@ This method clears the cache when it's used.
 
 * returns: <[number]> The count of total requests.
 
-### event: 'newpage'
-
-* `page` <[Page]>
-
-Emitted when a [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s page is opened.
-
 ### event: 'requestdisallowed'
 
 * `options` <[Object]>
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 8e96afa9..6fdb7cb9 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0
 
 - Set `previousUrl` to `onSuccess` argument.
 - Set `options`, `depth`, `previousUrl` to errors.
+- Support `customCrawl` for [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions)'s options.
+
+### Changed
+
+- Drop `newpage` event.
 
 ### Fixed
 
@@ -24,7 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0
 - Support `cookies` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
 - Make `onSuccess` pass `cookies` in the response.
 
-### changed
+### Changed
 
 - Update [Puppeteer](https://github.com/GoogleChrome/puppeteer) version to 1.4.0.
 
@@ -36,7 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0
 - Emit `requestdisallowed` event.
 - Make `onSuccess` pass `redirectChain` in the response.
 
-### changed
+### Changed
 
 - Bump Node.js version up to 8.10.0.
 - Update [Puppeteer](https://github.com/GoogleChrome/puppeteer) version to 1.3.0.
@@ -68,7 +73,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0
 
 ## [1.3.4] - 2018-02-22
 
-### changed
+### Changed
 
 - Drop `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
 
@@ -79,7 +84,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0
 - Emit `newpage` event.
 - Support `deniedDomains` and `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
 
-### changed
+### Changed
 
 - Allow `allowedDomains` option to accept a list of regular expressions.
@@ -106,7 +111,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0
 - Add [HCCrawler.defaultArgs()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerdefaultargs) method.
 - Emit `requestretried` event.
 
-### changed
+### Changed
 
 - Use `cache` option not only for remembering already requested URLs but for request queue for distributed environments.
 - Moved `onSuccess`, `onError` and `maxDepth` options from [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions) to [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions).
 
@@ -117,7 +122,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0
 - Support `obeyRobotsTxt` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
 - Support `persist` for [RedisCache](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#rediscache)'s constructing options.
 
-### changed
+### Changed
 
 - Make `cache` to be required for [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions)'s options.
 - Provide `skipDuplicates` to remember and skip duplicate URLs, instead of passing `null` to `cache` option.
diff --git a/examples/custom-crawl.js b/examples/custom-crawl.js
new file mode 100644
index 00000000..eb88ee63
--- /dev/null
+++ b/examples/custom-crawl.js
@@ -0,0 +1,29 @@
+const HCCrawler = require('headless-chrome-crawler');
+
+(async () => {
+  const crawler = await HCCrawler.launch({
+    customCrawl: async (page, crawl) => {
+      // You can access the page object before requests
+      await page.setRequestInterception(true);
+      page.on('request', request => {
+        if (request.url().endsWith('/')) {
+          request.continue();
+        } else {
+          request.abort();
+        }
+      });
+      // The result contains options, links, cookies, etc.
+      const result = await crawl();
+      // You can access the page object after requests
+      result.content = await page.content();
+      // You need to extend and return the crawled result
+      return result;
+    },
+    onSuccess: result => {
+      console.log(`Got ${result.content} for ${result.options.url}.`);
+    },
+  });
+  await crawler.queue('https://example.com/');
+  await crawler.onIdle();
+  await crawler.close();
+})();
diff --git a/lib/crawler.js b/lib/crawler.js
index 0c3ca058..29e72c39 100644
--- a/lib/crawler.js
+++ b/lib/crawler.js
@@ -32,14 +32,18 @@ class Crawler {
   /**
    * @param {!Puppeteer.Page} page
    * @param {!Object} options
+   * @param {!number} depth
+   * @param {string} previousUrl
    */
-  constructor(page, options) {
+  constructor(page, options, depth, previousUrl) {
     this._page = page;
     this._options = options;
+    this._depth = depth;
+    this._previousUrl = previousUrl;
   }
 
   /**
-   * @return {!Promise}
+   * @return {!Promise}
    */
   async crawl() {
     await this._prepare();
@@ -57,6 +61,9 @@
       this._collectLinks(response.url),
     ]);
     return {
+      options: this._options,
+      depth: this._depth,
+      previousUrl: this._previousUrl,
       response: this._reduceResponse(response),
       redirectChain: this._getRedirectChain(response),
       result,
@@ -248,7 +255,7 @@
   }
 
   /**
-   * @return {!Promise}
+   * @return {!Promise}
    * @private
    */
   async _screenshot() {
@@ -266,7 +273,7 @@
 
   /**
    * @param {!string} baseUrl
-   * @return {!Promise}
+   * @return {!Promise>}
    * @private
    */
   async _collectLinks(baseUrl) {
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index 0cfd2ae5..4c93b375 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -54,6 +54,7 @@ const CONSTRUCTOR_OPTIONS = CONNECT_OPTIONS.concat(LAUNCH_OPTIONS).concat([
   'preRequest',
   'onSuccess',
   'onError',
+  'customCrawl',
 ]);
 
 const EMPTY_TXT = '';
@@ -62,7 +63,7 @@ const deviceNames = Object.keys(devices);
 class HCCrawler extends EventEmitter {
   /**
    * @param {!Object=} options
-   * @return {!Promise}
+   * @return {!Promise}
    */
   static async connect(options) {
     const browser = await Puppeteer.connect(pick(options, CONNECT_OPTIONS));
@@ -73,7 +74,7 @@ class HCCrawler extends EventEmitter {
 
   /**
    * @param {!Object=} options
-   * @return {!Promise}
+   * @return {!Promise}
   */
   static async launch(options) {
     const browser = await Puppeteer.launch(pick(options, LAUNCH_OPTIONS));
@@ -134,6 +135,7 @@ class HCCrawler extends EventEmitter {
     this._preRequest = options.preRequest || null;
     this._onSuccess = options.onSuccess || null;
     this._onError = options.onError || null;
+    this._customCrawl = options.customCrawl || null;
     this._exportHeader();
     this._queue.on('pull', (_options, depth, previousUrl) => this._startRequest(_options, depth, previousUrl));
     this._browser.on('disconnected', () => void this.emit(HCCrawler.Events.Disconnected));
@@ -331,17 +333,15 @@ class HCCrawler extends EventEmitter {
    */
   async _request(options, depth, previousUrl, retryCount = 0) {
     this.emit(HCCrawler.Events.RequestStarted, options);
-    const crawler = await this._newPage(options);
-    this.emit(HCCrawler.Events.NewPage, crawler.page());
+    const crawler = await this._newCrawler(options, depth, previousUrl);
     try {
-      const res = await crawler.crawl();
+      const res = await this._crawl(crawler);
       await crawler.close();
       this.emit(HCCrawler.Events.RequestFinished, options);
       const requested = await this._checkRequestedRedirect(options, res.response);
       await this._markRequested(options);
       await this._markRequestedRedirects(options, res.redirectChain, res.response);
       if (requested) return [];
-      extend(res, { options, depth, previousUrl });
       this._exportLine(res);
       await this._success(res);
       return res.links;
@@ -546,11 +546,22 @@
   /**
    * @param {!Object} options
+   * @param {!number} depth
+   * @param {string} previousUrl
    * @return {!Promise}
    * @private
    */
-  async _newPage(options) {
+  async _newCrawler(options, depth, previousUrl) {
     const page = await this._browser.newPage();
-    return new Crawler(page, options);
+    return new Crawler(page, options, depth, previousUrl);
+  }
+
+  /**
+   * @param {!Crawler} crawler
+   * @return {!Promise}
+   */
+  async _crawl(crawler) {
+    if (!this._customCrawl) return crawler.crawl();
+    const crawl = () => crawler.crawl.call(crawler);
+    return this._customCrawl(crawler.page(), crawl);
   }
 
   /**
@@ -633,7 +644,6 @@
 }
 
 HCCrawler.Events = {
-  NewPage: 'newpage',
   RequestStarted: 'requeststarted',
   RequestSkipped: 'requestskipped',
   RequestDisallowed: 'requestdisallowed',
diff --git a/test/hccrawler/index.test.js b/test/hccrawler/index.test.js
index 555f38c8..0630130a 100644
--- a/test/hccrawler/index.test.js
+++ b/test/hccrawler/index.test.js
@@ -198,19 +198,6 @@ describe('HCCrawler', () => {
       }
     });
 
-    test('emits a newpage event', async () => {
-      let request;
-      let response;
-      this.crawler.on('newpage', page => {
-        page.on('request', _request => { request = _request; });
-        page.on('response', _response => { response = _response; });
-      });
-      await this.crawler.queue(INDEX_PAGE);
-      await this.crawler.onIdle();
-      expect(request.response()).toBe(response);
-      expect(this.onSuccess).toHaveBeenCalledTimes(1);
-    });
-
     test('crawls when the requested domain exactly matches allowed domains', async () => {
       let requestskipped = 0;
       this.crawler.on('requestskipped', () => { requestskipped += 1; });
@@ -864,6 +851,32 @@ describe('HCCrawler', () => {
     });
   });
 
+  describe('when the crawler is launched with the customCrawl function', () => {
+    describe('when the customCrawl sets page content to the result', () => {
+      async function customCrawl(page, crawl) {
+        const result = await crawl();
+        result.content = await page.content();
+        return result;
+      }
+
+      beforeEach(async () => {
+        this.crawler = await HCCrawler.launch(extend({
+          onSuccess: this.onSuccess,
+          customCrawl,
+        }, DEFAULT_OPTIONS));
+      });
+
+      test('resolves the page content', async () => {
+        const content = `Welcome to ${INDEX_PAGE}`;
+        this.server.setContent('/', content);
+        await this.crawler.queue(INDEX_PAGE);
+        await this.crawler.onIdle();
+        expect(this.onSuccess).toHaveBeenCalledTimes(1);
+        expect(this.onSuccess.mock.calls[0][0].content).toContain(content);
+      });
+    });
+  });
+
   describe('when the crawler is launched with the exporter option', () => {
     function removeTemporaryFile(file) {
       return new Promise(resolve => {
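
Note on usage: the `customCrawl` option added by this patch receives the Puppeteer page and a `crawl` callback that runs the built-in crawl; whatever object it returns (normally the extended result of `crawl()`) is what `onSuccess` and the exporter receive. The following is a minimal sketch of another way to use the hook, assuming only the API introduced above; the `title` field and the log format are illustrative additions, not part of the patch.

const HCCrawler = require('headless-chrome-crawler');

(async () => {
  const crawler = await HCCrawler.launch({
    customCrawl: async (page, crawl) => {
      // Run the built-in crawl first; with this patch its result already
      // carries options, depth, previousUrl, response, links, etc.
      const result = await crawl();
      // Attach extra data taken straight from the Puppeteer page
      // (result.title is a hypothetical field for this sketch).
      result.title = await page.title();
      return result;
    },
    onSuccess: result => {
      console.log(`${result.options.url} (depth ${result.depth}): ${result.title}`);
    },
  });
  await crawler.queue('https://example.com/');
  await crawler.onIdle();
  await crawler.close();
})();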