Skip to content

Commit

Permalink
Merge pull request #1 from yujiosaka/devtools
Browse files Browse the repository at this point in the history
Devtools
  • Loading branch information
yujiosaka authored Dec 5, 2017
2 parents 9569975 + e7d48cc commit 9138361
Show file tree
Hide file tree
Showing 14 changed files with 1,685 additions and 39 deletions.
17 changes: 17 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
version: 2
jobs:
build:
docker:
- image: circleci/node:6.10
steps:
- checkout
- restore_cache:
keys:
- v1-dependencies-{{ checksum "package.json" }}
- v1-dependencies-
- run: yarn install
- save_cache:
paths:
- node_modules
key: v1-dependencies-{{ checksum "package.json" }}
- run: yarn test
12 changes: 12 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
root = true

[*.js]
indent_style = space
indent_size = 2
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[package.json]
indent_size = 2
1 change: 1 addition & 0 deletions .eslintignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
coverage/
3 changes: 2 additions & 1 deletion .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ module.exports = {
"class-methods-use-this": 0,
"arrow-parens": ["warn", "as-needed"],
"no-underscore-dangle": 0,
"no-console": 0
"no-console": 0,
"no-void": 0,
}
}
8 changes: 2 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
logs
*.log
npm-debug.log*
lib-cov
coverage
node_modules/
.npm
coverage/
.yarn-integrity
.eslintcache
23 changes: 22 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Powered by [Puppeteer](https://github.com/GoogleChrome/puppeteer), headless-chro
### Installation

```
npm i headless-chrome-crawler
yarn add headless-chrome-crawler
```

> **Note**: headless-chrome-crawler is powered by [Puppeteer](https://github.com/GoogleChrome/puppeteer). With installation, it automatically downloads a recent version of Chromium. To skip the download, see [Environment variables](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#environment-variables).
Expand Down Expand Up @@ -138,3 +138,24 @@ When both defined, hccrawler.queue's options are always preferred.
#### hccrawler.queueSize

* returns: <[number]> The size of queues. This property is read only.

## Debugging tips

### Puppeteer.launch's options

[hccrawler.launch](#hccrawlerlaunchoptions)'s options are passed straight to [Puppeteer.launch API](https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#puppeteerlaunchoptions).
It may be useful to set the `headless` and `slowMo` options so that you can see what is going on.

```js
hccrawler.launch({ headless: false, slowMo: 10 });
```

### Enable debug logging

All requests and browser logs are logged via the [debug](https://github.com/visionmedia/debug) module under the `hccrawler` namespace.

```
env DEBUG="hccrawler:*" node script.js
env DEBUG="hccrawler:request" node script.js
env DEBUG="hccrawler:browser" node script.js
```
10 changes: 0 additions & 10 deletions examples/README.md

This file was deleted.

2 changes: 1 addition & 1 deletion examples/skip-duplicates.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ const hccrawler = new HCCrawler({
p: $('p').text(),
})),
onSuccess: (result => {
requestedObj[result.url] = true;
requestedObj[result.options.url] = true;
console.log('onSuccess', result);
}),
shouldRequest: (options => {
Expand Down
44 changes: 29 additions & 15 deletions lib/hccrawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ const _ = require('lodash');
const PQueue = require('p-queue');
const puppeteer = require('puppeteer');
const devices = require('puppeteer/DeviceDescriptors');
const debugBrowser = require('debug')('hccrawler:browser');
const debugRequest = require('debug')('hccrawler:request');
const { delay } = require('./helper');

const deviceNames = Object.keys(devices);
Expand Down Expand Up @@ -51,34 +53,46 @@ class HCCrawler {
}

_request(options, retryCount = 0) {
if (retryCount === 0) debugRequest(`Start requesting ${options.url}`);
return Promise.resolve(options.shouldRequest ? options.shouldRequest(options) : true)
.then(shouldRequest => {
if (!shouldRequest) return Promise.resolve();
if (!shouldRequest) {
debugRequest(`Skip requesting ${options.url}`);
return Promise.resolve();
}
return this.browser.newPage()
.then(page => {
page.on('console', (msg => void debugBrowser(msg.text)));
const credentials = _.pick(options, ['username', 'password']);
if (options.username || options.password) page.authenticate(credentials);
if (options.captureConsole) page.on('console', this._captureConsole);
return (options.device ? page.emulate(devices[options.device]) : Promise.resolve())
.then(() => page.goto(options.url, _.pick(options, ['timeout', 'waitUntil'])))
.then(res => (
(options.jQuery ? page.addScriptTag({ path: jQueryPath }) : Promise.resolve())
.then(() => page.evaluate(options.evaluatePage))
const emulate = options.device
? page.emulate(devices[options.device])
: Promise.resolve();
return emulate.then(() => page.goto(options.url, _.pick(options, ['timeout', 'waitUntil'])))
.then(res => {
debugRequest(`Opened page for ${options.url}`);
const addScriptTag = options.jQuery
? page.addScriptTag({ path: jQueryPath })
: Promise.resolve();
return addScriptTag.then(() => page.evaluate(options.evaluatePage))
.then(result => options.onSuccess({ status: res.status, options, result }))
.then(() => void debugRequest(`End requesting ${options.url}`))
.then(() => page.close())
.then(() => delay(options.delay))
));
.then(() => void debugRequest(`Closed page for ${options.url}`))
.then(() => delay(options.delay));
});
});
})
.catch(err => {
if (retryCount >= options.retryCount) throw new Error(`Retried too many times while requesting ${options.url}!`, err);
if (retryCount >= options.retryCount) throw new Error(`Retry give-up for requesting ${options.url}!`, err);
debugRequest(`Retry requesting ${options.url} ${retryCount + 1} times`);
return delay(options.retryDelay).then(() => this._request(options, retryCount + 1));
})
.catch(options.onError || _.noop);
}

_captureConsole(msg) {
console[msg.type](`[browser] ${msg.text}`);
.catch(err => {
debugRequest(`Retry give-up for requesting ${options.url} after ${retryCount} tries`);
const onError = options.onError || _.noop;
return onError(err);
});
}

close() {
Expand Down
10 changes: 7 additions & 3 deletions lib/helper.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
exports.delay = function delay(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
};
/**
 * Collection of small, stateless helper functions exposed as static methods.
 */
class Util {
  /**
   * Creates a promise that settles once a timer of the given length fires.
   *
   * @param {number} ms - timeout, in milliseconds, handed to `setTimeout`.
   * @returns {Promise<undefined>} resolves (with no value) when the timer fires.
   */
  static delay(ms) {
    return new Promise(resolve => {
      setTimeout(resolve, ms);
    });
  }
}

module.exports = Util;
9 changes: 7 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
"license": "MIT",
"author": "Yuji Isobe",
"scripts": {
"coverage": "istanbul cover _mocha",
"lint": "eslint --quiet -f codeframe . || eslint .",
"test": "npm run lint"
"test": "npm run lint && npm run coverage && mocha"
},
"repository": {
"type": "git",
Expand All @@ -18,6 +19,7 @@
},
"homepage": "https://github.com/yujiosaka/headless-chrome-crawler#readme",
"dependencies": {
"debug": "3.1.0",
"jquery": "3.2.1",
"lodash": "4.17.4",
"p-queue": "2.3.0",
Expand All @@ -26,7 +28,10 @@
"devDependencies": {
"eslint": "4.11.0",
"eslint-config-airbnb": "16.1.0",
"eslint-plugin-import": "2.8.0"
"eslint-plugin-import": "2.8.0",
"istanbul": "0.4.5",
"mocha": "4.0.1",
"power-assert": "1.4.4"
},
"keywords": [
"headless",
Expand Down
6 changes: 6 additions & 0 deletions test/.eslintrc.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// ESLint configuration applied to the files under test/.
module.exports = {
// Start from the configuration file one directory up.
"extends": "../.eslintrc.js",
"env": {
// Turn on ESLint's "mocha" environment for the test files.
"mocha": true,
},
};
26 changes: 26 additions & 0 deletions test/helper.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
const assert = require('assert');
const { delay } = require('../lib/helper');

describe('Helper', () => {
  describe('Helper.delay', () => {
    /**
     * Starts one timer of `flagMs` that flips a flag when it fires, then a
     * second timer of `checkMs`, and resolves with the flag's value at the
     * moment the second timer fires.
     */
    const flagStateAfter = (flagMs, checkMs) => {
      const state = { waited: false };
      delay(flagMs).then(() => {
        state.waited = true;
      });
      return delay(checkMs).then(() => state.waited);
    };

    // The 50ms timer must already have fired when the 100ms one resolves.
    it('should wait until shorter delay', () => (
      flagStateAfter(50, 100).then(waited => {
        assert.equal(waited, true);
      })
    ));

    // The 100ms timer must still be pending when the 50ms one resolves.
    it('should not wait until longer delay', () => (
      flagStateAfter(100, 50).then(waited => {
        assert.equal(waited, false);
      })
    ));
  });
});
Loading

0 comments on commit 9138361

Please sign in to comment.