From 598b06f3fa1c964e288a4d5d4ba40274289d84da Mon Sep 17 00:00:00 2001
From: yujiosaka
Date: Wed, 3 Jan 2018 06:20:24 +0900
Subject: [PATCH 1/2] Support expire option for redis cache

---
 cache/base.js      | 16 ++++++++--------
 cache/redis.js     | 24 +++++++++++++++++-------
 cache/session.js   | 12 ++++++------
 exporter/base.js   |  4 ++--
 test/cache.test.js | 32 ++++++++++++++++++++++++--------
 5 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/cache/base.js b/cache/base.js
index 315ddac3..7fabd076 100644
--- a/cache/base.js
+++ b/cache/base.js
@@ -3,28 +3,28 @@
  */
 class BaseCache {
   /**
-   * @param {!Object} settings
+   * @param {!Object=} settings
    */
   constructor(settings) {
-    this._settings = settings;
+    this._settings = settings || {};
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    */
   init() {
     throw new Error('Init is not overridden!');
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    */
   close() {
     throw new Error('Close is not overridden!');
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    */
   clear() {
     throw new Error('Clear is not overridden!');
@@ -32,7 +32,7 @@ class BaseCache {
 
   /**
    * @param {!string} key
-   * @return {Promise}
+   * @return {!Promise}
    */
   get() {
     throw new Error('Get is not overridden!');
@@ -41,7 +41,7 @@ class BaseCache {
   /**
    * @param {!string} key
    * @param {!string} value
-   * @return {Promise}
+   * @return {!Promise}
    */
   set() {
     throw new Error('Set is not overridden!');
@@ -49,7 +49,7 @@ class BaseCache {
 
   /**
    * @param {!string} key
-   * @return {Promise}
+   * @return {!Promise}
    */
   remove() {
     throw new Error('Remove is not overridden!');
diff --git a/cache/redis.js b/cache/redis.js
index 1c06c533..cbcf73ee 100644
--- a/cache/redis.js
+++ b/cache/redis.js
@@ -7,7 +7,7 @@ const redis = require('redis');
 class RedisCache extends BaseCache {
   /**
    * @override
-   * @return {Promise}
+   * @return {!Promise}
    */
   init() {
     this._client = redis.createClient(this._settings);
@@ -15,7 +15,7 @@
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   clear() {
@@ -31,7 +31,7 @@
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   close() {
@@ -41,7 +41,7 @@
 
   /**
    * @param {!string} key
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   get(key) {
@@ -59,7 +59,7 @@
   /**
    * @param {!string} key
    * @param {!string} value
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   set(key, value) {
@@ -69,14 +69,24 @@
           reject(error);
           return;
         }
-        resolve();
+        if (!this._settings.expire) {
+          resolve();
+          return;
+        }
+        this._client.expire(key, this._settings.expire, _error => {
+          if (_error) {
+            reject(_error);
+            return;
+          }
+          resolve();
+        });
       });
     });
   }
 
   /**
    * @param {!string} key
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   remove(key) {
diff --git a/cache/session.js b/cache/session.js
index 5d31707b..34fee3b7 100644
--- a/cache/session.js
+++ b/cache/session.js
@@ -5,7 +5,7 @@ const BaseCache = require('./base');
  */
 class SessionCache extends BaseCache {
   /**
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   init() {
@@ -14,7 +14,7 @@
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   clear() {
@@ -23,14 +23,14 @@
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   close() {}
 
   /**
    * @param {!string} key
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   get(key) {
@@ -40,7 +40,7 @@ class SessionCache extends BaseCache {
   /**
    * @param {!string} key
    * @param {!string} value
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   set(key, value) {
@@ -50,7 +50,7 @@ class SessionCache extends BaseCache {
 
   /**
    * @param {!string} key
-   * @return {Promise}
+   * @return {!Promise}
    * @override
    */
   remove(key) {
diff --git a/exporter/base.js b/exporter/base.js
index 898a53c6..516c48b0 100644
--- a/exporter/base.js
+++ b/exporter/base.js
@@ -6,7 +6,7 @@ const { createWriteStream } = require('fs');
  */
 class BaseExporter {
   /**
-   * @param {Object=} settings
+   * @param {!Object=} settings
    */
   constructor(settings) {
     this._settings = extend({ encoding: 'utf8' }, settings);
@@ -19,7 +19,7 @@
   }
 
   /**
-   * @return {Promise}
+   * @return {!Promise}
    */
   onEnd() {
     return new Promise((resolve, reject) => {
diff --git a/test/cache.test.js b/test/cache.test.js
index f123fd05..c7ae53c6 100644
--- a/test/cache.test.js
+++ b/test/cache.test.js
@@ -1,6 +1,7 @@
 const assert = require('assert');
 const SessionCache = require('../cache/session');
 const RedisCache = require('../cache/redis');
+const { delay } = require('../lib/helper');
 
 const KEY = '35aa17374c';
 const VALUE = '1';
@@ -12,13 +13,13 @@ describe('Cache', () => {
     it('passes test suites', () => (
       cache.set(KEY, VALUE)
         .then(() => cache.get(KEY))
-        .then(get => void assert.equal(get, VALUE))
+        .then(value => void assert.equal(value, VALUE))
         .then(() => cache.remove(KEY))
         .then(() => cache.get(KEY))
-        .then(get => void assert.equal(get, null))
+        .then(value => void assert.equal(value, null))
         .then(() => cache.set(KEY, VALUE))
         .then(() => cache.clear())
-        .then(get => void assert.equal(get, null))
+        .then(value => void assert.equal(value, null))
     ));
   }
 
@@ -37,11 +38,26 @@ describe('Cache', () => {
   });
 
   describe('RedisCache', () => {
-    beforeEach(() => {
-      cache = new RedisCache();
-      return cache.init()
-        .then(() => cache.clear());
+    context('constructed without expire option', () => {
+      beforeEach(() => {
+        cache = new RedisCache();
+        return cache.init()
+          .then(() => cache.clear());
+      });
+      itPassesTestSuits();
+    });
+    context('constructed with expire = 1', () => {
+      beforeEach(() => {
+        cache = new RedisCache({ expire: 1 });
+        return cache.init()
+          .then(() => cache.clear());
+      });
+      it('expires after wait', () => (
+        cache.set(KEY, VALUE)
+          .then(() => delay(1500))
+          .then(() => cache.get(KEY))
+          .then(value => void assert.equal(value, null))
+      ));
     });
-    itPassesTestSuits();
   });
 });

From 8051f67bc00897cad27a6102c3c912eeab88d7a6 Mon Sep 17 00:00:00 2001
From: yujiosaka
Date: Wed, 3 Jan 2018 07:05:09 +0900
Subject: [PATCH 2/2] Add README

---
 CHANGELOG.md |  1 +
 README.md    | 35 ++++++++++++++++++++++-------------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b1b50543..526a9d37 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ### Added
 
 - Support `obeyRobotsTxt` for [crawler.queue()](https://github.com/yujiosaka/headless-chrome-crawler#crawlerqueueoptions)'s options.
+- Support `expire` for [RedisCache](https://github.com/yujiosaka/headless-chrome-crawler#rediscache)'s constructing options.
 
 ### changed
 
diff --git a/README.md b/README.md
index c2d1392f..06d5ab52 100644
--- a/README.md
+++ b/README.md
@@ -203,21 +203,21 @@ See [puppeteer.executablePath()](https://github.com/GoogleChrome/puppeteer/blob/
 #### crawler.queue([options])
 
 * `options` <[Object]>
-  * `url` <[String]> Url to navigate to. The url should include scheme, e.g. `https://`.
+  * `url` <[string]> Url to navigate to. The url should include scheme, e.g. `https://`.
   * `maxDepth` <[number]> Maximum depth for the crawler to follow links automatically, default to 1. Leave default to disable following links.
   * `priority` <[number]> Basic priority of queues, defaults to `1`. Priority with larger number is preferred.
   * `skipDuplicates` <[boolean]> Whether to skip duplicate requests, default to `null`. The request is considered to be the same if `url`, `userAgent`, `device` and `extraHeaders` are strictly the same.
   * `obeyRobotsTxt` <[boolean]> Whether to obey [robots.txt](https://developers.google.com/search/reference/robots_txt), default to `true`.
-  * `allowedDomains` <[Array]<[String]>> List of domains allowed to request. `www.example.com` will be allowed if `example.com` is listed.
+  * `allowedDomains` <[Array]<[string]>> List of domains allowed to request. `www.example.com` will be allowed if `example.com` is listed.
   * `delay` <[number]> Number of milliseconds after each request, defaults to `0`. When delay is set, `maxConcurrency` option must be `1`.
   * `retryCount` <[number]> Number of limit when retry fails, defaults to `3`.
   * `retryDelay` <[number]> Number of milliseconds after each retry fails, defaults to `10000`.
   * `jQuery` <[boolean]> Whether to automatically add [jQuery](https://jquery.com) tag to page, defaults to `true`.
-  * `device` <[String]> Device to emulate. Available devices are listed [here](https://github.com/GoogleChrome/puppeteer/blob/master/DeviceDescriptors.js).
-  * `username` <[String]> Username for basic authentication. pass `null` if it's not necessary.
+  * `device` <[string]> Device to emulate. Available devices are listed [here](https://github.com/GoogleChrome/puppeteer/blob/master/DeviceDescriptors.js).
+  * `username` <[string]> Username for basic authentication. pass `null` if it's not necessary.
   * `screenshot` <[Object]> Screenshot option, defaults to `null`. This option is passed to [Puppeteer's page.screenshot()](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#pagescreenshotoptions). Pass `null` or leave default to disable screenshot.
-  * `password` <[String]> Password for basic authentication. pass `null` if it's not necessary.
-  * `userAgent` <[String]> User agent string to override in this page.
+  * `password` <[string]> Password for basic authentication. pass `null` if it's not necessary.
+  * `userAgent` <[string]> User agent string to override in this page.
   * `extraHeaders` <[Object]> An object containing additional headers to be sent with every request. All header values must be strings.
   * `preRequest(options)` <[Function]> Function to do anything like modifying `options` before each request. You can also return `false` if you want to skip the request.
     * `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
@@ -226,8 +226,8 @@ See [puppeteer.executablePath()](https://github.com/GoogleChrome/puppeteer/blob/
   * `response` <[Object]>
     * `response` <[Object]>
       * `ok` <[boolean]> whether the status code in the range 200-299 or not.
-      * `status` <[String]> status code of the request.
-      * `url` <[String]> Last requested url.
+      * `status` <[string]> status code of the request.
+      * `url` <[string]> Last requested url.
       * `headers` <[Object]> Response headers.
     * `options` <[Object]> [crawler.queue()](#crawlerqueueoptions)'s options with default values.
     * `result` <[Serializable]> The result resolved from `evaluatePage()` option.
@@ -348,7 +348,7 @@ Emitted when the browser instance is disconnected.
 
 ### class: SessionCache
 
-`SessionCache` is the [HCCrawler.connect()](#hccrawlerconnectoptions)'s default `cache` option. By default, the crawler remembers already requested urls on its memory. Pass `null` to the option in order to disable it.
+`SessionCache` is the [HCCrawler.connect()](#hccrawlerconnectoptions)'s default `cache` option. By default, the crawler remembers already requested urls on its memory.
 
 ```js
 const HCCrawler = require('headless-chrome-crawler');
@@ -360,9 +360,12 @@ HCCrawler.launch({ cache: null });
 
 ### class: RedisCache
 
-Passing a `RedisCache` object to the [HCCrawler.connect()](#hccrawlerconnectoptions)'s `cache` option allows you to persist requested urls in Redis and prevents from requesting same urls in a distributed servers' environment. It also works well with its `persistCache` option to be true.
+* `options` <[Object]>
+  * `expire` <[number]> Seconds to expire the cache after setting each value, defaults to `null`.
+
+Passing a `RedisCache` object to the [HCCrawler.connect()](#hccrawlerconnectoptions)'s `cache` option allows you to persist requested urls and [robots.txt](https://developers.google.com/search/reference/robots_txt) in [Redis](https://redis.io), so that it prevents requesting the same urls in a distributed servers' environment. It also works well when the `persistCache` option is set to `true`.
 
-Its constructing options are passed to [NodeRedis's redis.createClient()](https://github.com/NodeRedis/node_redis#rediscreateclient)'s options.
+Other constructing options are passed to [NodeRedis's redis.createClient()](https://github.com/NodeRedis/node_redis#rediscreateclient)'s options.
 
 ```js
 const HCCrawler = require('headless-chrome-crawler');
@@ -385,7 +388,10 @@ See [here](https://github.com/yujiosaka/headless-chrome-crawler/blob/master/exam
 
 ### class: CSVExporter
 
-Both `file` and `fields` options are required. `separator` option is optional, defaults to `,`.
+* `options` <[Object]>
+  * `file` <[string]> File path to export output.
+  * `fields` <[Array]> List of fields to be used for columns. This option is also used for the headers.
+  * `separator` <[string]> Character to separate columns, defaults to `,`.
 
 ```js
 const HCCrawler = require('headless-chrome-crawler');
@@ -405,7 +411,10 @@ HCCrawler.launch({ exporter })
 
 ### class: JSONLineExporter
 
-Only `file` option is required. You can also pass `fields` and `jsonReplacer` options. Passing `fields` option limits the fields of the results, and `jsonReplacer` is used for [JSON.stringify()](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify)'s second argument, which is useful to sorts keys always in the same order.
+* `options` <[Object]>
+  * `file` <[string]> File path to export output.
+  * `fields` <[Array]> List of fields to be filtered in JSON, defaults to `null`. Leave default not to filter fields.
+  * `jsonReplacer` <[Function]> Function that alters the behavior of the stringification process, defaults to `null`. This is useful to always sort keys in the same order.
 
 ```js
 const HCCrawler = require('headless-chrome-crawler');