// This file is used to generate documentation only.
/**
* Main Scraper class
* @example
* // npm install easy_web_crawler
* const Scaper = require('easy_web_crawler')
* var scraper = new Scaper();
*/
class Scaper {
constructor() { }
/**
* This is mandatory.<br>
* Takes the list of URLs used as the starting point.
* @param {(string|string[])} listOfURLs
* @example
* // add the URLs used as the starting point
* scraper.startWithURLs(['www.google.com','www.bing.com'])
* scraper.startWithURLs('www.google.com')
*/
startWithURLs(listOfURLs) { }
/**
* Takes a non-async callback function as an argument; a URL is added to the processing queue only if the function returns a truthy value.<br>
* This is optional.
* By default, all URLs are accepted into the processing queue.<br>
* @param {function} nonAsyncFunction
* @example
* // accept only URLs that contain www.google.com
* scraper.allowIfMatches(function(url) {
* return url.indexOf('www.google.com') > -1
* })
*/
allowIfMatches(nonAsyncFunction) { }
/**
* This is an optional setting.<br>
* This saves your progress to a file so you can stop and restart the scraper from its previous state.<br>
* The file is a SQLite database file; you can modify its contents using SQLite clients.<br>
* If no file is specified, the state is stored in memory.
* @param {string} filePath
* @example
* // state stored in state.db file
* scraper.saveProgressInFile("./state.db")
*/
saveProgressInFile(filePath) { }
/**
* This allows the scraper to automatically collect all the links from each page and add them to the processing queue.<br>
* Note that URLs are filtered out if the allowIfMatches function returns false.
* @param {boolean} flag - true to enable
* @example
* scraper.enableAutoCrawler(true)
*/
enableAutoCrawler(flag) { }
/**
* Time delay between each page load in milliseconds
* @param {number} [delayInMilliSeconds=0]
* @example
* // wait 90 milliseconds between page loads
* scraper.waitBetweenPageLoad(90)
*/
waitBetweenPageLoad(delayInMilliSeconds) { }
/**
* Final callback invoked when scraping is completed.
* @param {function} asyncFunction
* @example
* scraper.callbackOnFinish(function(result){
* console.log(result)
* })
*/
callbackOnFinish(asyncFunction) { }
/**
* This is the main function. Your scraping logic is defined in this function.<br>
* It is called for each page in the processing queue.<br>
* It is called with the Puppeteer page object as input.<br>
* The page object is enhanced with additional methods to support scraping.
* @see Page
* @param {function} asyncFunction - an async function with a single input argument, page.
* @example
* scraper.callbackOnPageLoad(async function(page){
* var article = await page.$eval('article', tag => tag.innerText);
* page.saveResult(article)
* })
*/
callbackOnPageLoad(asyncFunction) { }
/**
* Starts the scraping process.
* The callbackOnFinish function is called once scraping is completed.
* @example
* scraper.start()
*/
start() { }
}
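
// End-to-end usage sketch, pieced together from the method examples above.
// Illustrative only: the start URL, the 'article' selector, and the "./state.db"
// path are assumptions, not requirements of the library.
//
// const Scaper = require('easy_web_crawler')
// var scraper = new Scaper();
// scraper.startWithURLs(['www.google.com'])
// scraper.allowIfMatches(function(url) {
//     return url.indexOf('www.google.com') > -1 // stay on the same site
// })
// scraper.saveProgressInFile("./state.db") // optional: resumable state
// scraper.enableAutoCrawler(true) // follow links automatically
// scraper.waitBetweenPageLoad(90) // throttle page loads
// scraper.callbackOnPageLoad(async function(page) {
//     var article = await page.$eval('article', tag => tag.innerText);
//     page.saveResult(article) // one result per URL
// })
// scraper.callbackOnFinish(function(result) {
//     console.log(result) // collected results
// })
// scraper.start()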
/**
* Puppeteer page class.
* Enhanced with the supporting functions detailed below.
*
*/
class Page {
/**
* Downloads an image from a URL and saves it to the local disk.
* @param {string} image_download_url
* @param {string} where_to_full_file_path
* @example
* scraper.callbackOnPageLoad(async function(page){
* var img = await page.$('img')
* var img_src = await page.evaluate(img => img.getAttribute("src"), img);
* page.download_image(img_src,"usr/test/profile.png")
* })
*/
download_image(image_download_url, where_to_full_file_path) {
}
/**
* Saves the text result; this will be returned as input to the callbackOnFinish function.<br>
* Each URL can store one result.
* @param {string} text
* @example
* scraper.callbackOnPageLoad(async function(page){
* var article = await page.$eval('article', tag => tag.innerText);
* page.saveResult(article)
* })
*/
saveResult(text) {
}
/**
* Writes text content to a local file.
* @param {string} content
* @param {string} filename
* @example
* scraper.callbackOnPageLoad(async function(page){
* var article = await page.$eval('article', tag => tag.innerText);
* page.write_text_to_file(article,"usr/test/article.txt")
* });
*/
write_text_to_file(content, filename) {
}
/**
* Adds the URL to the processing queue.
* @param {string} url
* @example
* scraper.callbackOnPageLoad(async function(page){
* var a = await page.$('a')
* var url = await page.evaluate(a => a.getAttribute("href"), a);
* page.add_url_to_queue(url)
* });
*/
add_url_to_queue(url) {
}
}
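
// Sketch of the Page helpers used together inside a single callbackOnPageLoad
// callback. The selectors and file paths are illustrative assumptions.
//
// scraper.callbackOnPageLoad(async function(page) {
//     // save this page's text as the result for its URL
//     var article = await page.$eval('article', tag => tag.innerText);
//     page.saveResult(article)
//     page.write_text_to_file(article, "usr/test/article.txt")
//     // queue the first link found on the page for processing
//     var a = await page.$('a')
//     var url = await page.evaluate(a => a.getAttribute("href"), a);
//     page.add_url_to_queue(url)
// });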