diff --git a/docs/API.md b/docs/API.md
index d71953d2..21bfca7c 100644
--- a/docs/API.md
+++ b/docs/API.md
@@ -22,7 +22,6 @@
   * [crawler.queueSize()](#crawlerqueuesize)
   * [crawler.pendingQueueSize()](#crawlerpendingqueuesize)
   * [crawler.requestedCount()](#crawlerrequestedcount)
-  * [event: 'newpage'](#event-newpage)
   * [event: 'requestdisallowed'](#event-requestdisallowed)
   * [event: 'requeststarted'](#event-requeststarted)
   * [event: 'requestskipped'](#event-requestskipped)
@@ -73,8 +72,11 @@ const HCCrawler = require('headless-chrome-crawler');
 * `persistCache` <[boolean]> Whether to clear cache on closing or disconnecting from the Chromium instance, defaults to `false`.
 * `preRequest(options)` <[Function]> Function to do anything like modifying `options` before each request. You can also return `false` if you want to skip the request.
   * `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
-* `onSuccess(response)` <[Function]> Function to be called when `evaluatePage()` successes.
-  * `response` <[Object]>
+* `customCrawl(page, crawl)` <[Function]> Function to customize the crawled result, allowing access to [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s raw API.
+  * `page` <[Page]> [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s raw `Page` object.
+  * `crawl` <[Function]> Function to run the crawl, which resolves to the result passed to the `onSuccess` function.
+* `onSuccess(result)` <[Function]> Function to be called when `evaluatePage()` succeeds.
+  * `result` <[Object]>
     * `redirectChain` <[Array]<[Object]>> Redirect chain of requests.
       * `url` <[string]> Requested url.
       * `headers` <[Object]> Request headers.
@@ -130,8 +132,24 @@ url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, ret
 * `persistCache` <[boolean]> Whether to clear cache on closing or disconnecting from the Chromium instance, defaults to `false`.
 * `preRequest(options)` <[Function]> Function to do anything like modifying `options` before each request. You can also return `false` if you want to skip the request.
   * `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
-* `onSuccess(response)` <[Function]> Function to be called when `evaluatePage()` successes.
-  * `response` <[Object]>
+* `customCrawl(page, crawl)` <[Function]> Function to customize the crawled result, allowing access to [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s raw API.
+  * `page` <[Page]> [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s raw `Page` object.
+  * `crawl` <[Function]> Function to run the crawl, which resolves to the result passed to the `onSuccess` function.
+* `onSuccess(result)` <[Function]> Function to be called when `evaluatePage()` succeeds.
+  * `result` <[Object]>
+    * `redirectChain` <[Array]<[Object]>> Redirect chain of requests.
+      * `url` <[string]> Requested url.
+      * `headers` <[Object]> Request headers.
+    * `cookies` <[Array]<[Object]>> List of cookies.
+      * `name` <[string]>
+      * `value` <[string]>
+      * `domain` <[string]>
+      * `path` <[string]>
+      * `expires` <[number]> Unix time in seconds.
+      * `httpOnly` <[boolean]>
+      * `secure` <[boolean]>
+      * `session` <[boolean]>
+      * `sameSite` <[string]> `"Strict"` or `"Lax"`.
     * `response` <[Object]>
       * `ok` <[boolean]> whether the status code in the range 200-299 or not.
       * `status` <[string]> status code of the request.
@@ -140,7 +158,7 @@ url, allowedDomains, deniedDomains, timeout, priority, depthPriority, delay, ret
     * `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
     * `result` <[Serializable]> The result resolved from `evaluatePage()` option.
     * `screenshot` <[Buffer]> Buffer with the screenshot image, which is `null` when `screenshot` option not passed.
-    * `links` <[Array]> List of links found in the requested page.
+    * `links` <[Array]<[string]>> List of links found in the requested page.
     * `depth` <[number]> Depth of the followed links.
     * `previousUrl` <[string]> The previous request's url. The value is `null` for the initial request.
 * `onError(error)` <[Function]> Function to be called when request fails.
@@ -280,12 +298,6 @@ This method clears the cache when it's used.

 * returns: <[number]> The count of total requests.

-### event: 'newpage'
-
-* `page` <[Page]>
-
-Emitted when a [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s page is opened.
-
 ### event: 'requestdisallowed'

 * `options` <[Object]>
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 8e96afa9..6fdb7cb9 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 - Set `previousUrl` to `onSuccess` argument.
 - Set `options`, `depth`, `previousUrl` to errors.
+- Support `customCrawl` for [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions)'s options.
+
+### Changed
+
+- Drop `newpage` event.

 ### Fixed

@@ -24,7 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Support `cookies` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
 - Make `onSuccess` pass `cookies` in the response.

-### changed
+### Changed

 - Update [Puppeteer](https://github.com/GoogleChrome/puppeteer) version to 1.4.0.

@@ -36,7 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Emit `requestdisallowed` event.
 - Make `onSuccess` pass `redirectChain` in the response.

-### changed
+### Changed

 - Bump Node.js version up to 8.10.0.
 - Update [Puppeteer](https://github.com/GoogleChrome/puppeteer) version to 1.3.0.
@@ -68,7 +73,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

 ## [1.3.4] - 2018-02-22

-### changed
+### Changed

 - Drop `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.

@@ -79,7 +84,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Emit `newpage` event.
 - Support `deniedDomains` and `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.

-### changed
+### Changed

 - Allow `allowedDomains` option to accept a list of regular expressions.

@@ -106,7 +111,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Add [HCCrawler.defaultArgs()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerdefaultargs) method.
 - Emit `requestretried` event.
-### changed
+### Changed

 - Use `cache` option not only for remembering already requested URLs but for request queue for distributed environments.
 - Moved `onSuccess`, `onError` and `maxDepth` options from [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions) to [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions).
@@ -117,7 +122,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 - Support `obeyRobotsTxt` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
 - Support `persist` for [RedisCache](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#rediscache)'s constructing options.

-### changed
+### Changed

 - Make `cache` to be required for [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions)'s options.
 - Provide `skipDuplicates` to remember and skip duplicate URLs, instead of passing `null` to `cache` option.
diff --git a/examples/conditional-screenshot.js b/examples/conditional-screenshot.js
index 3d39658f..a0a434c1 100644
--- a/examples/conditional-screenshot.js
+++ b/examples/conditional-screenshot.js
@@ -4,14 +4,14 @@ const PATH = './tmp/';

 (async () => {
   const crawler = await HCCrawler.launch({
-    onSuccess: (result => {
+    onSuccess: result => {
       console.log(`Screenshot is saved as ${PATH}${result.options.saveAs} for ${result.options.url}.`);
-    }),
-    preRequest: (options => {
+    },
+    preRequest: options => {
       if (!options.saveAs) return false; // Skip the request by returning false
       options.screenshot = { path: `${PATH}${options.saveAs}` };
       return true;
-    }),
+    },
   });
   await crawler.queue({ url: 'https://example.com/' }); // saveAs is a custom option for preRequest to conditionally modify options and skip requests
diff --git a/examples/custom-cache.js b/examples/custom-cache.js
index fbc43628..f2cbae45 100644
--- a/examples/custom-cache.js
+++ b/examples/custom-cache.js
@@ -63,9 +63,9 @@ const cache = new FsCache({ file: FILE });
 (async () => {
   const crawler = await HCCrawler.launch({
     maxConcurrency: 1,
-    onSuccess: (result => {
+    onSuccess: result => {
       console.log(`Requested ${result.options.url}.`);
-    }),
+    },
     cache,
   });
   await crawler.queue('https://example.com/');
diff --git a/examples/custom-crawl.js b/examples/custom-crawl.js
new file mode 100644
index 00000000..eb88ee63
--- /dev/null
+++ b/examples/custom-crawl.js
@@ -0,0 +1,29 @@
+const HCCrawler = require('headless-chrome-crawler');
+
+(async () => {
+  const crawler = await HCCrawler.launch({
+    customCrawl: async (page, crawl) => {
+      // You can access the page object before requests
+      await page.setRequestInterception(true);
+      page.on('request', request => {
+        if (request.url().endsWith('/')) {
+          request.continue();
+        } else {
+          request.abort();
+        }
+      });
+      // The result contains options, links, cookies, etc.
+      const result = await crawl();
+      // You can access the page object after requests
+      result.content = await page.content();
+      // You need to extend and return the crawled result
+      return result;
+    },
+    onSuccess: result => {
+      console.log(`Got ${result.content} for ${result.options.url}.`);
+    },
+  });
+  await crawler.queue('https://example.com/');
+  await crawler.onIdle();
+  await crawler.close();
+})();
diff --git a/examples/emulate-device.js b/examples/emulate-device.js
index 78b5e29c..35a08766 100644
--- a/examples/emulate-device.js
+++ b/examples/emulate-device.js
@@ -3,12 +3,12 @@ const HCCrawler = require('headless-chrome-crawler');
 (async () => {
   const crawler = await HCCrawler.launch({
     url: 'https://example.com/',
-    evaluatePage: (() => ({
+    evaluatePage: () => ({
       userAgent: window.navigator.userAgent,
-    })),
-    onSuccess: (result => {
-      console.log(`Emulated ${result.result.userAgent} for ${result.options.url}.`);
     }),
+    onSuccess: result => {
+      console.log(`Emulated ${result.result.userAgent} for ${result.options.url}.`);
+    },
   });
   await crawler.queue({ device: 'Nexus 7' });
   await crawler.queue({ userAgent: 'headless-chrome-crawler' }); // Only override userAgent
diff --git a/examples/multiple-queue.js b/examples/multiple-queue.js
index 94467ba0..cbcf7af9 100644
--- a/examples/multiple-queue.js
+++ b/examples/multiple-queue.js
@@ -2,12 +2,12 @@ const HCCrawler = require('headless-chrome-crawler');

 (async () => {
   const crawler = await HCCrawler.launch({
-    evaluatePage: (() => ({
+    evaluatePage: () => ({
       title: $('title').text(),
-    })),
-    onSuccess: (result => {
-      console.log(`Got ${result.result.title} for ${result.options.url}.`);
     }),
+    onSuccess: result => {
+      console.log(`Got ${result.result.title} for ${result.options.url}.`);
+    },
   });
   await crawler.queue('https://example.com/'); // Queue a request
   await crawler.queue(['https://example.net/', { url: 'https://example.org/' }]); // Queue multiple requests in different styles
diff --git a/examples/override-function.js b/examples/override-function.js
index da859b54..8b6d5bf1 100644
--- a/examples/override-function.js
+++ b/examples/override-function.js
@@ -2,18 +2,18 @@ const HCCrawler = require('headless-chrome-crawler');

 (async () => {
   const crawler = await HCCrawler.launch({
-    evaluatePage: (() => {
+    evaluatePage: () => {
       throw new Error("Global functions won't be called");
-    }),
-    onSuccess: (result => {
+    },
+    onSuccess: result => {
       console.log(`Got ${result.result.title} for ${result.options.url}.`);
-    }),
+    },
   });
   await crawler.queue({
     url: 'https://example.com/',
-    evaluatePage: (() => ({
+    evaluatePage: () => ({
       title: $('title').text(),
-    })),
+    }),
   });
   await crawler.onIdle();
   await crawler.close();
diff --git a/examples/pause-resume.js b/examples/pause-resume.js
index 415985db..712f0af3 100644
--- a/examples/pause-resume.js
+++ b/examples/pause-resume.js
@@ -4,9 +4,9 @@ const HCCrawler = require('headless-chrome-crawler');
   const crawler = await HCCrawler.launch({
     maxConcurrency: 1,
     maxRequest: 2,
-    onSuccess: (result => {
+    onSuccess: result => {
       console.log(`Requested ${result.options.url}.`);
-    }),
+    },
   });
   await crawler.queue('https://example.com/');
   await crawler.queue('https://example.net/');
diff --git a/examples/priority-queue.js b/examples/priority-queue.js
index c5fa4fb4..c9b5e180 100644
--- a/examples/priority-queue.js
+++ b/examples/priority-queue.js
@@ -3,9 +3,9 @@ const HCCrawler = require('headless-chrome-crawler');
 (async () => {
   const crawler = await HCCrawler.launch({
     maxConcurrency: 1,
-    onSuccess: (result => {
+    onSuccess: result => {
       console.log(`Requested ${result.options.url}.`);
-    }),
+    },
   });
   await crawler.queue({ url: 'https://example.com/', priority: 1 });
   await crawler.queue({ url: 'https://example.net/', priority: 2 }); // This queue is requested before the previous queue
diff --git a/examples/redis-cache.js b/examples/redis-cache.js
index 5bb65f67..7da27828 100644
--- a/examples/redis-cache.js
+++ b/examples/redis-cache.js
@@ -5,9 +5,9 @@ const cache = new RedisCache({ host: '127.0.0.1', port: 6379 });
 function launch(persistCache) {
   return HCCrawler.launch({
-    onSuccess: (result => {
+    onSuccess: result => {
       console.log(`Requested ${result.options.url}.`);
-    }),
+    },
     cache,
     persistCache, // Cache won't be cleared when closing the crawler if set true
   });
 }
diff --git a/lib/crawler.js b/lib/crawler.js
index 0c3ca058..29e72c39 100644
--- a/lib/crawler.js
+++ b/lib/crawler.js
@@ -32,14 +32,18 @@ class Crawler {
   /**
    * @param {!Puppeteer.Page} page
    * @param {!Object} options
+   * @param {!number} depth
+   * @param {string} previousUrl
    */
-  constructor(page, options) {
+  constructor(page, options, depth, previousUrl) {
     this._page = page;
     this._options = options;
+    this._depth = depth;
+    this._previousUrl = previousUrl;
   }

   /**
-   * @return {!Promise}
+   * @return {!Promise<!Object>}
    */
   async crawl() {
     await this._prepare();
@@ -57,6 +61,9 @@ class Crawler {
       this._collectLinks(response.url),
     ]);
     return {
+      options: this._options,
+      depth: this._depth,
+      previousUrl: this._previousUrl,
       response: this._reduceResponse(response),
       redirectChain: this._getRedirectChain(response),
       result,
@@ -248,7 +255,7 @@ class Crawler {
   }

   /**
-   * @return {!Promise}
+   * @return {!Promise<?Buffer>}
    * @private
    */
   async _screenshot() {
@@ -266,7 +273,7 @@ class Crawler {

   /**
    * @param {!string} baseUrl
-   * @return {!Promise}
+   * @return {!Promise<!Array<!string>>}
    * @private
    */
   async _collectLinks(baseUrl) {
diff --git a/lib/hccrawler.js b/lib/hccrawler.js
index 0cfd2ae5..4c93b375 100644
--- a/lib/hccrawler.js
+++ b/lib/hccrawler.js
@@ -54,6 +54,7 @@ const CONSTRUCTOR_OPTIONS = CONNECT_OPTIONS.concat(LAUNCH_OPTIONS).concat([
   'preRequest',
   'onSuccess',
   'onError',
+  'customizeCrawl',
 ]);

 const EMPTY_TXT = '';
@@ -62,7 +63,7 @@ const deviceNames = Object.keys(devices);
 class HCCrawler extends EventEmitter {
   /**
    * @param {!Object=} options
-   * @return {!Promise}
+   * @return {!Promise<!HCCrawler>}
    */
   static async connect(options) {
     const browser = await Puppeteer.connect(pick(options, CONNECT_OPTIONS));
@@ -73,7 +74,7 @@ class HCCrawler extends EventEmitter {

   /**
    * @param {!Object=} options
-   * @return {!Promise}
+   * @return {!Promise<!HCCrawler>}
    */
   static async launch(options) {
     const browser = await Puppeteer.launch(pick(options, LAUNCH_OPTIONS));
@@ -134,6 +135,7 @@ class HCCrawler extends EventEmitter {
     this._preRequest = options.preRequest || null;
     this._onSuccess = options.onSuccess || null;
     this._onError = options.onError || null;
+    this._customCrawl = options.customCrawl || null;
     this._exportHeader();
     this._queue.on('pull', (_options, depth, previousUrl) => this._startRequest(_options, depth, previousUrl));
     this._browser.on('disconnected', () => void this.emit(HCCrawler.Events.Disconnected));
@@ -331,17 +333,15 @@ class HCCrawler extends EventEmitter {
    */
   async _request(options, depth, previousUrl, retryCount = 0) {
     this.emit(HCCrawler.Events.RequestStarted, options);
-    const crawler = await this._newPage(options);
-    this.emit(HCCrawler.Events.NewPage, crawler.page());
+    const crawler = await this._newCrawler(options, depth, previousUrl);
     try {
-      const res = await crawler.crawl();
+      const res = await this._crawl(crawler);
       await crawler.close();
       this.emit(HCCrawler.Events.RequestFinished, options);
       const requested = await this._checkRequestedRedirect(options, res.response);
       await this._markRequested(options);
       await this._markRequestedRedirects(options, res.redirectChain, res.response);
       if (requested) return [];
-      extend(res, { options, depth, previousUrl });
       this._exportLine(res);
       await this._success(res);
       return res.links;
@@ -546,11 +546,22 @@ class HCCrawler extends EventEmitter {
   /**
    * @param {!Object} options
    * @return {!Promise}
+   * @param {!number} depth
+   * @param {string} previousUrl
    * @private
    */
-  async _newPage(options) {
+  async _newCrawler(options, depth, previousUrl) {
     const page = await this._browser.newPage();
-    return new Crawler(page, options);
+    return new Crawler(page, options, depth, previousUrl);
+  }
+  /**
+   * @param {!Crawler} crawler
+   * @return {!Promise}
+   */
+  async _crawl(crawler) {
+    if (!this._customCrawl) return crawler.crawl();
+    const crawl = () => crawler.crawl.call(crawler);
+    return this._customCrawl(crawler.page(), crawl);
   }

   /**
@@ -633,7 +644,6 @@ class HCCrawler extends EventEmitter {
 }

 HCCrawler.Events = {
-  NewPage: 'newpage',
   RequestStarted: 'requeststarted',
   RequestSkipped: 'requestskipped',
   RequestDisallowed: 'requestdisallowed',
diff --git a/test/hccrawler/index.test.js b/test/hccrawler/index.test.js
index 555f38c8..0630130a 100644
--- a/test/hccrawler/index.test.js
+++ b/test/hccrawler/index.test.js
@@ -198,19 +198,6 @@ describe('HCCrawler', () => {
       }
     });
-    test('emits a newpage event', async () => {
-      let request;
-      let response;
-      this.crawler.on('newpage', page => {
-        page.on('request', _request => { request = _request; });
-        page.on('response', _response => { response = _response; });
-      });
-      await this.crawler.queue(INDEX_PAGE);
-      await this.crawler.onIdle();
-      expect(request.response()).toBe(response);
-      expect(this.onSuccess).toHaveBeenCalledTimes(1);
-    });
-
     test('crawls when the requested domain exactly matches allowed domains', async () => {
       let requestskipped = 0;
       this.crawler.on('requestskipped', () => { requestskipped += 1; });
       await this.crawler.queue(INDEX_PAGE);
@@ -864,6 +851,32 @@ describe('HCCrawler', () => {
     });
   });

+  describe('when the crawler is launched with the customCrawl function', () => {
+    describe('when the customCrawl sets page content to the result', () => {
+      async function customCrawl(page, crawl) {
+        const result = await crawl();
+        result.content = await page.content();
+        return result;
+      }
+
+      beforeEach(async () => {
+        this.crawler = await HCCrawler.launch(extend({
+          onSuccess: this.onSuccess,
+          customCrawl,
+        }, DEFAULT_OPTIONS));
+      });
+
+      test('resolves the page content', async () => {
+        const content = `