Merge pull request #222 from yujiosaka/emit-requestdisallowed-event
Emit requestdisallowed event
yujiosaka authored Apr 20, 2018
2 parents a3498f7 + 87f7909 commit 37ded4f
Showing 3 changed files with 25 additions and 14 deletions.
API.md: 7 additions, 0 deletions
@@ -23,6 +23,7 @@
* [crawler.pendingQueueSize()](#crawlerpendingqueuesize)
* [crawler.requestedCount()](#crawlerrequestedcount)
* [event: 'newpage'](#event-newpage)
+* [event: 'requestdisallowed'](#event-requestdisallowed)
* [event: 'requeststarted'](#event-requeststarted)
* [event: 'requestskipped'](#event-requestskipped)
* [event: 'requestfinished'](#event-requestfinished)
@@ -253,6 +254,12 @@ This method clears the cache when it's used.

Emitted when a [Puppeteer](https://github.com/GoogleChrome/puppeteer)'s page is opened.

+### event: 'requestdisallowed'
+
+* `options` <[Object]>
+
+Emitted when a request is disallowed by robots.txt.
+
### event: 'requeststarted'

* `options` <[Object]>
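For context, the new event documented above sits alongside the existing lifecycle events and only fires while `obeyRobotsTxt` is enabled (the default). A minimal usage sketch, assuming the standard headless-chrome-crawler launch/queue lifecycle; the URL and log messages are illustrative:

```js
const HCCrawler = require('headless-chrome-crawler');

(async () => {
  const crawler = await HCCrawler.launch({
    onSuccess: result => console.log(`Crawled ${result.options.url}`),
  });
  // Robots.txt rejections now surface through their own event
  // instead of being folded into 'requestskipped'.
  crawler.on('requestdisallowed', options => {
    console.warn(`Disallowed by robots.txt: ${options.url}`);
  });
  await crawler.queue('https://example.com/');
  await crawler.onIdle();
  await crawler.close();
})();
```

The split from 'requestskipped' is what the updated tests at the bottom of this diff assert: allowed paths never fire the event, disallowed paths fire it exactly once, and `obeyRobotsTxt: false` suppresses it entirely.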
lib/hccrawler.js: 6 additions, 2 deletions
@@ -289,6 +289,11 @@ class HCCrawler extends EventEmitter {
this.emit(HCCrawler.Events.RequestSkipped, options);
return;
}
+const allowed = await this._checkAllowedRobots(options);
+if (!allowed) {
+this.emit(HCCrawler.Events.RequestDisallowed, options);
+return;
+}
await this._followSitemap(options, depth);
const links = await this._request(options, depth);
this._checkRequestCount();
@@ -308,8 +313,6 @@
if (requested) return true;
const shouldRequest = await this._shouldRequest(options);
if (!shouldRequest) return true;
-const allowedRobot = await this._checkAllowedRobots(options);
-if (!allowedRobot) return true;
return false;
}

@@ -585,6 +588,7 @@ HCCrawler.Events = {
NewPage: 'newpage',
RequestStarted: 'requeststarted',
RequestSkipped: 'requestskipped',
+RequestDisallowed: 'requestdisallowed',
RequestFinished: 'requestfinished',
RequestRetried: 'requestretried',
RequestFailed: 'requestfailed',
test/hccrawler/index.test.js: 12 additions, 12 deletions
@@ -283,42 +283,42 @@ describe('HCCrawler', function () {

it('crawls when the path is allowed by the robots.txt', async function () {
server.setContent('/robots.txt', 'User-agent: *\nAllow: /');
-let requestskipped = 0;
-crawler.on('requestskipped', () => { requestskipped += 1; });
+let requestdisallowed = 0;
+crawler.on('requestdisallowed', () => { requestdisallowed += 1; });
await crawler.queue(INDEX_PAGE);
await crawler.onIdle();
-assert.equal(requestskipped, 0);
+assert.equal(requestdisallowed, 0);
assert.equal(onSuccess.callCount, 1);
});

it('skips crawling when the path is not allowed by the robots.txt', async function () {
server.setContent('/robots.txt', 'User-agent: *\nDisallow: /');
-let requestskipped = 0;
-crawler.on('requestskipped', () => { requestskipped += 1; });
+let requestdisallowed = 0;
+crawler.on('requestdisallowed', () => { requestdisallowed += 1; });
await crawler.queue(INDEX_PAGE);
await crawler.onIdle();
-assert.equal(requestskipped, 1);
+assert.equal(requestdisallowed, 1);
assert.equal(onSuccess.callCount, 0);
});

it('stops crawling when allowed and disallowed paths are mixed', async function () {
server.setContent('/robots.txt', 'User-agent: *\nDisallow: /2.html');
-let requestskipped = 0;
-crawler.on('requestskipped', () => { requestskipped += 1; });
+let requestdisallowed = 0;
+crawler.on('requestdisallowed', () => { requestdisallowed += 1; });
await crawler.queue(`${PREFIX}/1.html`);
await crawler.queue(`${PREFIX}/2.html`);
await crawler.onIdle();
-assert.equal(requestskipped, 1);
+assert.equal(requestdisallowed, 1);
assert.equal(onSuccess.callCount, 1);
});

it('does not obey the robots.txt with obeyRobotsTxt = false', async function () {
server.setContent('/robots.txt', 'User-agent: *\nDisallow: /');
-let requestskipped = 0;
-crawler.on('requestskipped', () => { requestskipped += 1; });
+let requestdisallowed = 0;
+crawler.on('requestdisallowed', () => { requestdisallowed += 1; });
await crawler.queue({ url: INDEX_PAGE, obeyRobotsTxt: false });
await crawler.onIdle();
-assert.equal(requestskipped, 0);
+assert.equal(requestdisallowed, 0);
assert.equal(onSuccess.callCount, 1);
});

