scraper.js
"use strict";
/** Import all required node module */
var fs = require('fs'); // Node module for file system
var request = require('request'); // Node module for making HTTP request
var cheerio = require('cheerio');
/**
Cheerio implements a subset of core jQuery that allow us to traverse the DOM using jQuery familiar syntax.
This module is actively maintained and is fast and flexible, this is the reason I chose this module.
*/
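/**
A minimal cheerio sketch (illustrative only; the markup and variable name are assumptions, not part of the scraper):
    var $demo = cheerio.load('<ul><li class="price">$18.00</li></ul>');
    $demo('.price').text(); // "$18.00"
*/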
var json2csv = require('json2csv');
/**
Converts JSON into CSV with column titles and proper line endings.
I chose this package because it is popular, has had many releases, and is actively maintained.
*/
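/**
A minimal json2csv sketch using the same callback API the scraper calls below (values are illustrative):
    json2csv({ data: [{ Title: 'Shirt', Price: '$18.00' }], fields: ['Title', 'Price'] }, function(err, csv) {
        // csv now holds a header row followed by one CSV row per object
    });
*/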
/** CSV header fields */
var csvFields = ["Title", "Price", "ImageURL", "URL", "Time"];
var data = [],
    shirtArray = [],
    shirtToScrap = [];
var dir = "./data",
    errorFile = 'scraper-error.log',
    url = "http://shirts4mike.com";
/** Make a request to the index page to get all shirt links */
request(url, function(error, response, body) {
    /** Request is successful and there is no error */
    if (!error && response.statusCode === 200) {
        /** Create a jQuery-like object */
        var $ = cheerio.load(body);
        /** Grab all shirt links from the page */
        var shirts = $("a[href*='shirt']");
        /** Traverse all links and store each path in shirtArray */
        shirts.each(function() {
            var fullPath = url + '/' + $(this).attr('href');
            if (shirtArray.indexOf(fullPath) === -1) {
                shirtArray.push(fullPath);
            }
        }); // End of shirts.each() method
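        /** At this point shirtArray holds absolute URLs such as "http://shirts4mike.com/shirt.php" (illustrative). */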
        /** Loop through shirtArray and collect all shirt links */
        for (var i = 0; i < shirtArray.length; i++) {
            /** Links that already have a query string id go straight into the shirtToScrap array */
            if (shirtArray[i].indexOf("?id=") > 0) {
                shirtToScrap.push(shirtArray[i]);
            } else {
                /** Make a request to the "http://shirts4mike.com/shirt.php" page to get additional shirt links */
                request(shirtArray[i], function(error, response, body) {
                    /** Request is successful and there is no error */
                    if (!error && response.statusCode === 200) {
                        /** Create a jQuery-like object */
                        var $ = cheerio.load(body);
                        /** Grab all shirt links from the page that have an id */
                        var shirts = $("a[href*='shirt.php?id=']");
                        /** Traverse all links and store each path in shirtToScrap */
                        shirts.each(function() {
                            var href = $(this).attr('href');
                            var fullPath = url + '/' + href;
                            if (shirtToScrap.indexOf(fullPath) === -1) {
                                shirtToScrap.push(fullPath);
                            }
                        }); // End of shirts.each() method
                        /** Now we have all shirt links in the shirtToScrap array; make a request to each link
                         * and get the shirt price, image, title, etc.
                         */
                        for (var i = 0; i < shirtToScrap.length; i++) {
                            /** Request to get the shirt details */
                            request(shirtToScrap[i], function(error, response, body) {
                                /** Request is successful and there is no error */
                                if (!error && response.statusCode === 200) {
                                    /** Create a jQuery-like object */
                                    var $ = cheerio.load(body);
                                    /** Create an object to hold the shirt details */
                                    var json = {};
                                    json.Title = $('title').text();
                                    json.Price = $('.price').text();
                                    json.ImageURL = $('.shirt-picture img').attr('src');
                                    json.URL = response.request.href;
                                    var today = new Date();
                                    json.Time = today; // Time of extraction
                                    /** Store the shirt details in the data array */
                                    data.push(json);
                                    /** Create a folder called 'data' if it does not already exist */
                                    if (!fs.existsSync(dir)) {
                                        fs.mkdirSync(dir);
                                    }
                                    /** Create a CSV file named with today's date */
                                    var dd = today.getDate();
                                    var mm = today.getMonth() + 1;
                                    var yyyy = today.getFullYear();
                                    var csvFileName = yyyy + "-" + dd + "-" + mm + ".csv";
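                                    // Note: as written this yields a year-day-month name, e.g. "2016-21-3.csv" for 21 March 2016.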
                                    /** Convert the JSON data into CSV format using the json2csv node module */
                                    json2csv({data: data, fields: csvFields}, function(err, csv) {
                                        if (err) throw err;
                                        /** If the data file for today already exists, it is overwritten */
                                        fs.writeFile(dir + "/" + csvFileName, csv, function(err) {
                                            if (err) throw err;
                                            console.log(csvFileName + ' created');
                                        }); // End of writeFile
                                    }); // End of json2csv method
                                } else {
                                    printErrorMessage(error);
                                } // End of if - request successful
                            }); // End of request method
                        } // End of for loop
                    } else {
                        printErrorMessage(error);
                    } // End of if
                }); // End of request method
            } // End of if
        } // End of for loop
    } else {
        printErrorMessage(error);
    } // End of if
}); // End of request method
/**
 * @description Error handling function
 * @param {object} error
 */
function printErrorMessage(error) {
    console.log('Error occurred while scraping site ' + url);
    var errorMsg = "[" + Date() + "]" + " : " + error + "\n";
    fs.appendFile(errorFile, errorMsg, function(err) {
        if (err) throw err;
        console.log('Error was logged to the "scraper-error.log" file');
    });
}
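/**
Usage sketch (assumes the three dependencies above are installed, e.g. `npm install request cheerio json2csv`):
    node scraper.js
On success a dated CSV is written to the ./data folder; request failures are appended to scraper-error.log.
*/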