
feat(hccrawler): replace newpage by custom crawl
yujiosaka committed Jun 10, 2018
1 parent 4f5bac5 commit 96d13f2
Showing 6 changed files with 96 additions and 39 deletions.
7 changes: 0 additions & 7 deletions docs/API.md
@@ -22,7 +22,6 @@
* [crawler.queueSize()](#crawlerqueuesize)
* [crawler.pendingQueueSize()](#crawlerpendingqueuesize)
* [crawler.requestedCount()](#crawlerrequestedcount)
* [event: 'newpage'](#event-newpage)
* [event: 'requestdisallowed'](#event-requestdisallowed)
* [event: 'requeststarted'](#event-requeststarted)
* [event: 'requestskipped'](#event-requestskipped)
@@ -280,12 +279,6 @@ This method clears the cache when it's used.

* returns: <[number]> The count of total requests.

### event: 'newpage'

* `page` <[Page]>

Emitted when a [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s page is opened.

### event: 'requestdisallowed'

* `options` <[Object]>
17 changes: 11 additions & 6 deletions docs/CHANGELOG.md
@@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

- Pass `previousUrl` to the `onSuccess` argument.
- Attach `options`, `depth` and `previousUrl` to errors.
- Support `customCrawl` for [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions)'s options.

### Changed

- Drop `newpage` event.

### Fixed

@@ -24,7 +29,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Support `cookies` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
- Make `onSuccess` pass `cookies` in the response.

### changed
### Changed

- Update [Puppeteer](https://github.com/GoogleChrome/puppeteer) version to 1.4.0.

@@ -36,7 +41,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Emit `requestdisallowed` event.
- Make `onSuccess` pass `redirectChain` in the response.

### changed
### Changed

- Bump Node.js version up to 8.10.0.
- Update [Puppeteer](https://github.com/GoogleChrome/puppeteer) version to 1.3.0.
@@ -68,7 +73,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [1.3.4] - 2018-02-22

### changed
### Changed

- Drop `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.

@@ -79,7 +84,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Emit `newpage` event.
- Support `deniedDomains` and `depthPriority` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.

### changed
### Changed

- Allow `allowedDomains` option to accept a list of regular expressions.

@@ -106,7 +111,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Add [HCCrawler.defaultArgs()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerdefaultargs) method.
- Emit `requestretried` event.

### changed
### Changed

- Use `cache` option not only for remembering already requested URLs but for request queue for distributed environments.
- Moved `onSuccess`, `onError` and `maxDepth` options from [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions) to [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions).
@@ -117,7 +122,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Support `obeyRobotsTxt` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#crawlerqueueoptions)'s options.
- Support `persist` for [RedisCache](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#rediscache)'s constructing options.

### changed
### Changed

- Make `cache` required for [HCCrawler.connect()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerconnectoptions) and [HCCrawler.launch()](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/docs/API.md#hccrawlerlaunchoptions)'s options.
- Provide `skipDuplicates` to remember and skip duplicate URLs, instead of passing `null` to `cache` option.
29 changes: 29 additions & 0 deletions examples/custom-crawl.js
@@ -0,0 +1,29 @@
const HCCrawler = require('headless-chrome-crawler');

(async () => {
const crawler = await HCCrawler.launch({
customCrawl: async (page, crawl) => {
// You can access the page object before requests
await page.setRequestInterception(true);
page.on('request', request => {
if (request.url().endsWith('/')) {
request.continue();
} else {
request.abort();
}
});
// The result contains options, links, cookies, etc.
const result = await crawl();
// You can access the page object after requests
result.content = await page.content();
// You need to extend and return the crawled result
return result;
},
onSuccess: result => {
console.log(`Got ${result.content} for ${result.options.url}.`);
},
});
await crawler.queue('https://example.com/');
await crawler.onIdle();
await crawler.close();
})();
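
Since this commit removes the `newpage` event, code that listened for it to reach the Puppeteer page can move the same hook into `customCrawl`. A minimal migration sketch under that assumption (the request logging and URL are illustrative, not part of this diff):

const HCCrawler = require('headless-chrome-crawler');

(async () => {
  const crawler = await HCCrawler.launch({
    customCrawl: async (page, crawl) => {
      // Previously done in a 'newpage' listener; the page is now handed to customCrawl directly.
      page.on('request', request => {
        console.log(`Requesting ${request.url()}`);
      });
      return crawl();
    },
    onSuccess: result => {
      console.log(`Crawled ${result.options.url}.`);
    },
  });
  await crawler.queue('https://example.com/');
  await crawler.onIdle();
  await crawler.close();
})();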
15 changes: 11 additions & 4 deletions lib/crawler.js
@@ -32,14 +32,18 @@ class Crawler {
/**
* @param {!Puppeteer.Page} page
* @param {!Object} options
* @param {!number} depth
* @param {string} previousUrl
*/
constructor(page, options) {
constructor(page, options, depth, previousUrl) {
this._page = page;
this._options = options;
this._depth = depth;
this._previousUrl = previousUrl;
}

/**
* @return {!Promise}
* @return {!Promise<!Object>}
*/
async crawl() {
await this._prepare();
@@ -57,6 +61,9 @@
this._collectLinks(response.url),
]);
return {
options: this._options,
depth: this._depth,
previousUrl: this._previousUrl,
response: this._reduceResponse(response),
redirectChain: this._getRedirectChain(response),
result,
@@ -248,7 +255,7 @@
}

/**
* @return {!Promise}
* @return {!Promise<!Buffer|!String>}
* @private
*/
async _screenshot() {
@@ -266,7 +273,7 @@

/**
* @param {!string} baseUrl
* @return {!Promise}
* @return {!Promise<!Array<!string>>}
* @private
*/
async _collectLinks(baseUrl) {
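
Because `crawl()` now returns `options`, `depth` and `previousUrl` alongside the response (see the hunk above), those fields reach the `onSuccess` handler as well. A short sketch of reading them, assuming the documented result shape (`links`, `previousUrl`) and an example URL; the handler body is illustrative, not part of this diff:

const HCCrawler = require('headless-chrome-crawler');

(async () => {
  const crawler = await HCCrawler.launch({
    onSuccess: result => {
      // options, depth and previousUrl are attached by Crawler#crawl() as shown above.
      const { options, depth, previousUrl, links } = result;
      console.log(`${options.url} (depth ${depth}, from ${previousUrl || 'start'}) found ${links.length} links`);
    },
  });
  await crawler.queue({ url: 'https://example.com/', maxDepth: 2 });
  await crawler.onIdle();
  await crawler.close();
})();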
28 changes: 19 additions & 9 deletions lib/hccrawler.js
@@ -54,6 +54,7 @@ const CONSTRUCTOR_OPTIONS = CONNECT_OPTIONS.concat(LAUNCH_OPTIONS).concat([
'preRequest',
'onSuccess',
'onError',
'customCrawl',
]);
const EMPTY_TXT = '';

@@ -62,7 +63,7 @@ const deviceNames = Object.keys(devices);
class HCCrawler extends EventEmitter {
/**
* @param {!Object=} options
* @return {!Promise}
* @return {!Promise<!HCCrawler>}
*/
static async connect(options) {
const browser = await Puppeteer.connect(pick(options, CONNECT_OPTIONS));
@@ -73,7 +74,7 @@

/**
* @param {!Object=} options
* @return {!Promise}
* @return {!Promise<!HCCrawler>}
*/
static async launch(options) {
const browser = await Puppeteer.launch(pick(options, LAUNCH_OPTIONS));
@@ -134,6 +135,7 @@ class HCCrawler extends EventEmitter {
this._preRequest = options.preRequest || null;
this._onSuccess = options.onSuccess || null;
this._onError = options.onError || null;
this._customCrawl = options.customCrawl || null;
this._exportHeader();
this._queue.on('pull', (_options, depth, previousUrl) => this._startRequest(_options, depth, previousUrl));
this._browser.on('disconnected', () => void this.emit(HCCrawler.Events.Disconnected));
@@ -331,17 +333,15 @@
*/
async _request(options, depth, previousUrl, retryCount = 0) {
this.emit(HCCrawler.Events.RequestStarted, options);
const crawler = await this._newPage(options);
this.emit(HCCrawler.Events.NewPage, crawler.page());
const crawler = await this._newCrawler(options, depth, previousUrl);
try {
const res = await crawler.crawl();
const res = await this._crawl(crawler);
await crawler.close();
this.emit(HCCrawler.Events.RequestFinished, options);
const requested = await this._checkRequestedRedirect(options, res.response);
await this._markRequested(options);
await this._markRequestedRedirects(options, res.redirectChain, res.response);
if (requested) return [];
extend(res, { options, depth, previousUrl });
this._exportLine(res);
await this._success(res);
return res.links;
@@ -546,11 +546,22 @@
/**
* @param {!Object} options
* @return {!Promise<!Crawler>}
* @param {!number} depth
* @param {string} previousUrl
* @private
*/
async _newPage(options) {
async _newCrawler(options, depth, previousUrl) {
const page = await this._browser.newPage();
return new Crawler(page, options);
return new Crawler(page, options, depth, previousUrl);
}
/**
* @param {!Crawler} crawler
* @return {!Promise<!Object>}
*/
async _crawl(crawler) {
if (!this._customCrawl) return crawler.crawl();
const crawl = () => crawler.crawl.call(crawler);
return this._customCrawl(crawler.page(), crawl);
}

/**
@@ -633,7 +644,6 @@
}

HCCrawler.Events = {
NewPage: 'newpage',
RequestStarted: 'requeststarted',
RequestSkipped: 'requestskipped',
RequestDisallowed: 'requestdisallowed',
39 changes: 26 additions & 13 deletions test/hccrawler/index.test.js
@@ -198,19 +198,6 @@ describe('HCCrawler', () => {
}
});

test('emits a newpage event', async () => {
let request;
let response;
this.crawler.on('newpage', page => {
page.on('request', _request => { request = _request; });
page.on('response', _response => { response = _response; });
});
await this.crawler.queue(INDEX_PAGE);
await this.crawler.onIdle();
expect(request.response()).toBe(response);
expect(this.onSuccess).toHaveBeenCalledTimes(1);
});

test('crawls when the requested domain exactly matches allowed domains', async () => {
let requestskipped = 0;
this.crawler.on('requestskipped', () => { requestskipped += 1; });
@@ -864,6 +851,32 @@
});
});

describe('when the crawler is launched with the customCrawl function', () => {
describe('when the customCrawl sets page content to the result', () => {
async function customCrawl(page, crawl) {
const result = await crawl();
result.content = await page.content();
return result;
}

beforeEach(async () => {
this.crawler = await HCCrawler.launch(extend({
onSuccess: this.onSuccess,
customCrawl,
}, DEFAULT_OPTIONS));
});

test('resolves the page content', async () => {
const content = `<h1>Welcome to ${INDEX_PAGE}</h1>`;
this.server.setContent('/', content);
await this.crawler.queue(INDEX_PAGE);
await this.crawler.onIdle();
expect(this.onSuccess).toHaveBeenCalledTimes(1);
expect(this.onSuccess.mock.calls[0][0].content).toContain(content);
});
});
});

describe('when the crawler is launched with the exporter option', () => {
function removeTemporaryFile(file) {
return new Promise(resolve => {
