Merge pull request #26 from yujiosaka/pluggable_cache_storage
Add pluggable cache
yujiosaka authored Dec 10, 2017
2 parents 4610dca + 2eb4632 commit ac2e099
Showing 21 changed files with 575 additions and 64 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -3,6 +3,7 @@ jobs:
build:
docker:
- image: circleci/node:6.10
- image: redis
steps:
- checkout
- run:
18 changes: 8 additions & 10 deletions README.md
@@ -8,7 +8,9 @@ Crawlers based on simple requests to html files are generally fast. However, it
Powered by [Puppeteer](https://github.com/GoogleChrome/puppeteer), headless-chrome-crawler allows you to scrape those single page applications with the following features:

* Configure concurrency, delay and retries
* Pluggable cache to skip duplicate requests
* Cancel requests by conditions
* Pause and resume at any time
* Insert [jQuery](https://jquery.com) automatically
* Priority queue
* Device emulation
@@ -34,24 +36,25 @@ The basic API of headless-chrome-crawler is inspired by that of [node-crawler](h
const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
// Function to be evaluated in browsers
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
p: $('p').text(),
})),
// Function to be called with evaluated results from browsers
onSuccess: (result => {
console.log('onSuccess', result); // resolves status, options and evaluated result.
console.log('onSuccess', result);
}),
})
.then(crawler => {
// Queue a single request
crawler.queue('https://example.com');
crawler.queue('https://example.com/');
// Queue multiple requests
crawler.queue(['https://example.net', 'https://example.org']);
crawler.queue(['https://example.net/', 'https://example.org/']);
// Queue a request with custom options
crawler.queue({
jQuery: false,
url: 'https://example.com',
url: 'https://example.com/',
evaluatePage: (() => ({
title: document.title,
h1: document.getElementsByTagName('h1')[0].innerText,
@@ -81,7 +84,6 @@ See [here](https://github.com/yujiosaka/headless-chrome-crawler/tree/master/exam
* [crawler.version()](#crawlerversion)
* [crawler.wsEndpoint()](#crawlerwsendpoint)
* [crawler.onIdle()](#crawleronidle)
* [crawler.onEnd()](#crawleronend)
* [crawler.queueSize](#crawlerqueuesize)
* [crawler.pendingQueueSize](#crawlerpendingqueuesize)
* [crawler.requestedCount](#crawlerrequestedcount)
@@ -194,10 +196,6 @@ See [Puppeteer's browser.wsEndpoint()](https://github.com/GoogleChrome/puppeteer

- returns: <[Promise]> Promise which is resolved when queues become empty.

#### crawler.onEnd()

- returns: <[Promise]> Promise which is resolved when request reaches max.

#### crawler.queueSize

* returns: <[number]> The size of queues. This property is read only.
7 changes: 3 additions & 4 deletions examples/delay.js
@@ -6,16 +6,15 @@ HCCrawler.launch({
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
p: $('p').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
}),
})
.then(crawler => {
crawler.queue({ url: 'https://example.com' });
crawler.queue({ url: 'https://example.net' });
crawler.queue({ url: 'https://example.org' });
crawler.queue({ url: 'https://example.com/' });
crawler.queue({ url: 'https://example.net/' });
crawler.queue({ url: 'https://example.org/' });
crawler.onIdle()
.then(() => crawler.close());
});
3 changes: 1 addition & 2 deletions examples/disable-jquery.js
@@ -8,7 +8,6 @@ HCCrawler.launch({
// $ is undefined, so this causes an error
title: $('title').text(),
h1: $('h1').text(),
p: $('p').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
@@ -18,7 +17,7 @@ HCCrawler.launch({
}),
})
.then(crawler => {
crawler.queue('https://example.com');
crawler.queue('https://example.com/');
crawler.onIdle()
.then(() => crawler.close());
});
6 changes: 2 additions & 4 deletions examples/emulate-device.js
@@ -4,16 +4,14 @@ HCCrawler.launch({
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
p: $('p').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
}),
})
.then(crawler => {
crawler.queue({ url: 'https://example.com', device: 'iPhone 6 Plus' });
crawler.queue({ url: 'https://example.com', device: 'iPad' });
crawler.queue({ url: 'https://example.com', device: 'Nexus 7' });
crawler.queue({ url: 'https://example.com/', device: 'iPhone 6 Plus' });
crawler.queue({ url: 'https://example.com/', device: 'Nexus 7' });
crawler.onIdle()
.then(() => crawler.close());
});
5 changes: 2 additions & 3 deletions examples/multiple-queue.js
@@ -4,15 +4,14 @@ HCCrawler.launch({
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
p: $('p').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
}),
})
.then(crawler => {
crawler.queue('https://example.com'); // one URL
crawler.queue(['https://example.net', { url: 'https://example.org' }]); // multiple URLs in different styles.
crawler.queue('https://example.com/'); // one URL
crawler.queue(['https://example.net/', { url: 'https://example.org/' }]); // multiple URLs in different styles.
crawler.onIdle()
.then(() => crawler.close());
});
3 changes: 1 addition & 2 deletions examples/override-function.js
@@ -11,11 +11,10 @@ HCCrawler.launch({
})
.then(crawler => {
crawler.queue({
url: 'https://example.com',
url: 'https://example.com/',
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
p: $('p').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
25 changes: 25 additions & 0 deletions examples/pause-resume.js
@@ -0,0 +1,25 @@
const HCCrawler = require('../');

HCCrawler.launch({
maxConcurrency: 1,
maxRequest: 2,
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
}),
})
.then(crawler => {
crawler.queue({ url: 'https://example.com/' });
crawler.queue({ url: 'https://example.net/' });
crawler.queue({ url: 'https://example.org/' }); // The queue won't be requested until resumed
crawler.onIdle()
.then(() => {
crawler.setMaxRequest(3);
crawler.resume();
return crawler.onIdle();
})
.then(() => crawler.close());
});
7 changes: 3 additions & 4 deletions examples/priority-queue.js
@@ -5,16 +5,15 @@ HCCrawler.launch({
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
p: $('p').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
}),
})
.then(crawler => {
crawler.queue({ url: 'https://example.com' }); // First queue will be requested first regardless of priority
crawler.queue({ url: 'https://example.net', priority: 1 });
crawler.queue({ url: 'https://example.org', priority: 2 }); // This queue is requested before the previous queue
crawler.queue({ url: 'https://example.com/' }); // First queue will be requested first regardless of priority
crawler.queue({ url: 'https://example.net/', priority: 1 });
crawler.queue({ url: 'https://example.org/', priority: 2 }); // This queue is requested before the previous queue
crawler.onIdle()
.then(() => crawler.close());
});
20 changes: 20 additions & 0 deletions examples/session-cache.js
@@ -0,0 +1,20 @@
const HCCrawler = require('../');

HCCrawler.launch({
maxConcurrency: 1,
evaluatePage: (() => ({
title: $('title').text(),
h1: $('h1').text(),
})),
onSuccess: (result => {
console.log('onSuccess', result);
}),
cache: new HCCrawler.SessionCache(),
})
.then(crawler => {
crawler.queue('https://example.com/');
crawler.queue('https://example.net/');
crawler.queue('https://example.com/'); // The queue won't be requested
crawler.onIdle()
.then(() => crawler.close());
});
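
The duplicate detection above is driven by the static key() helper added in lib/cache/base.js below: fields such as priority are omitted before hashing, so two requests that differ only in those fields collapse to the same cache key. A minimal sketch of that behavior (the require path is an assumption about how the module resolves inside the package):

const BaseCache = require('headless-chrome-crawler/lib/cache/base');

// priority is listed in OMITTED_HASH_FIELDS, so it does not affect the key:
// both objects stringify to the same JSON and hash to the same value.
const first = BaseCache.key({ url: 'https://example.com/', priority: 1 });
const second = BaseCache.key({ url: 'https://example.com/', priority: 2 });
console.log(first === second); // true
console.log(first.length); // 10, because keys are truncated to MAX_LENGTH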
10 changes: 3 additions & 7 deletions examples/skip-duplicates.js → examples/skip-request.js
@@ -1,7 +1,5 @@
const HCCrawler = require('../');

const requestedObj = {};

HCCrawler.launch({
maxConcurrency: 1,
evaluatePage: (() => ({
@@ -10,18 +8,16 @@ HCCrawler.launch({
p: $('p').text(),
})),
onSuccess: (result => {
requestedObj[result.options.url] = true;
console.log('onSuccess', result);
}),
preRequest: (options => {
if (requestedObj[options.url]) return false;
if (options.url === 'https://example.net/') return false;
return true;
}),
})
.then(crawler => {
crawler.queue('https://example.com');
crawler.queue('https://example.net');
crawler.queue('https://example.com'); // The queue won't be requested
crawler.queue('https://example.com/');
crawler.queue('https://example.net/');
crawler.onIdle()
.then(() => crawler.close());
});
86 changes: 86 additions & 0 deletions lib/cache/base.js
@@ -0,0 +1,86 @@
const _ = require('lodash');
const { hash, jsonStableReplacer } = require('../helper');

const OMITTED_HASH_FIELDS = [
'priority',
'allowedDomains',
'delay',
'retryCount',
'retryDelay',
'jQuery',
'username',
'password',
'preRequest',
'evaluatePage',
'onSuccess',
'onError',
'timeout',
'waitUntil',
];
const MAX_LENGTH = 10;

class BaseCache {
constructor(settings) {
this._settings = settings;
}

/**
* Initializing the cache storage
* @return {Promise} resolves when init operation completed
* @interface
*/
init() {
throw new Error('Init is not overridden!');
}

/**
* Closing the cache storage
* @return {Promise} resolves when close operation completed
* @interface
*/
close() {
throw new Error('Close is not overridden!');
}

/**
* Clearing the cache storage
* @return {Promise} resolves when clear operation completed
* @interface
*/
clear() {
throw new Error('Clear is not overridden!');
}

/**
* Method to check whether the requested options already exist in the cache storage
* @param {Object} options
* @return {Promise} resolves whether the requested options already exist
* @interface
*/
exists() {
throw new Error('Exists is not overridden!');
}

/**
* Method to set the requested options to the cache storage
* @param {Object} options
* @return {Promise} resolves when set operation completed
* @interface
*/
set() {
throw new Error('Set is not overridden!');
}

/**
* Method to generate a cache key from the requested options
* @param {Object} options
* @return {String} cache key for the options
* @static
*/
static key(options) {
const json = JSON.stringify(_.omit(options, OMITTED_HASH_FIELDS), jsonStableReplacer);
return hash(json).substring(0, MAX_LENGTH);
}
}

module.exports = BaseCache;
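
Every method in the interface above returns a Promise, so any asynchronous storage can back the crawler. The following is a rough sketch, not part of this commit, of a custom in-memory cache that fulfills the contract; it assumes implementations derive their storage keys from the static key() helper, as the omitted-fields list implies:

const BaseCache = require('./base');

// Hypothetical minimal implementation of the BaseCache interface.
class MemoryCache extends BaseCache {
  init() {
    this._storage = {};
    return Promise.resolve();
  }
  close() {
    return Promise.resolve();
  }
  clear() {
    this._storage = {};
    return Promise.resolve();
  }
  exists(options) {
    return Promise.resolve(!!this._storage[BaseCache.key(options)]);
  }
  set(options) {
    this._storage[BaseCache.key(options)] = true;
    return Promise.resolve();
  }
}

module.exports = MemoryCache;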
3 changes: 3 additions & 0 deletions lib/cache/index.js
@@ -0,0 +1,3 @@
exports.BaseCache = require('./base');
exports.SessionCache = require('./session');
exports.RedisCache = require('./redis');
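
The session-cache example above passes new HCCrawler.SessionCache() to launch(), and the CircleCI config now starts a redis image, so a Redis-backed run presumably follows the same pattern. The constructor options below are an assumption; a RedisCache would typically forward them to the underlying redis client:

const HCCrawler = require('headless-chrome-crawler');

HCCrawler.launch({
  // Assumed option shape: a redis client usually accepts host and port.
  cache: new HCCrawler.RedisCache({ host: '127.0.0.1', port: 6379 }),
  onSuccess: (result => {
    console.log('onSuccess', result);
  }),
})
  .then(crawler => {
    crawler.queue('https://example.com/');
    crawler.queue('https://example.com/'); // Skipped as a duplicate
    crawler.onIdle()
      .then(() => crawler.close());
  });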