Commit e1f643d
feat(crawler): fixing crawl functionality (#5)

## Motive

This MR fixes an issue with the bot's crawling functionality. The default fetcher used an `http.Client` with no timeout, so a request that took too long to respond would block indefinitely. The fix is to use an `http.Client` with a timeout and a custom transport that limits the number of concurrent requests.

## Changes

- Removed the `OnError` method.
- Added a zerolog logger.
- Refactored metrics monitoring and added new metrics.
- Reverted to a simple queue.
- Added a timeout to the HTTP client.
- Updated .gitignore.
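The exact fetcher code is not among the hunks shown below, but the fix the message describes can be sketched: an `http.Client` with a hard `Timeout` plus a `RoundTripper` that caps in-flight requests via a buffered-channel semaphore. This is an illustration of the technique, not the commit's actual code; `limitedTransport` and `newClient` are hypothetical names.

```go
package main

import (
	"log"
	"net/http"
	"time"
)

// limitedTransport is a hypothetical wrapper that bounds the number of
// concurrent requests with a buffered channel acting as a semaphore.
type limitedTransport struct {
	base http.RoundTripper
	sem  chan struct{} // capacity = max concurrent requests
}

func (t *limitedTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	t.sem <- struct{}{}        // acquire a slot; blocks once the limit is reached
	defer func() { <-t.sem }() // release the slot when the round trip returns
	return t.base.RoundTrip(req)
}

// newClient builds a client that fails slow requests instead of blocking
// forever, mirroring the fix described in the commit message.
func newClient(parallel int, timeout time.Duration) *http.Client {
	return &http.Client{
		Timeout: timeout,
		Transport: &limitedTransport{
			base: http.DefaultTransport,
			sem:  make(chan struct{}, parallel),
		},
	}
}

func main() {
	client := newClient(50, 10*time.Second)
	resp, err := client.Get("https://crawler-test.com/")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	log.Println("status:", resp.Status)
}
```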
1 parent 093db74 commit e1f643d

33 files changed: +1360 −1001 lines

.gitignore (+5 −4)

```diff
@@ -1,4 +1,5 @@
-cmd/tests
-makefile
-logs
-tmp
+bin/
+tests/
+.idea.md
+*.*prof
+.vscode/
```

Makefile (+6)

```diff
@@ -0,0 +1,6 @@
+clean:
+start:
+init:
+build:
+usage:
+.PHONY: clean start init build usage
```

README.md (+54 −9)

````diff
@@ -1,22 +1,67 @@
-## WBot
+# WBot

 A configurable, thread-safe web crawler, provides a minimal interface for crawling and downloading web pages.

-### Features:
+## Features
+
 - Clean minimal API.
 - Configurable: MaxDepth, MaxBodySize, Rate Limit, Parrallelism, User Agent & Proxy rotation.
 - Memory-efficient, thread-safe.
 - Provides built-in interface: Fetcher, Store, Queue & a Logger.

+## API
+
+WBot provides a minimal API for crawling web pages.
+
+```go
+Run(links ...string) error
+OnReponse(fn func(*wbot.Response))
+Metrics() map[string]int64
+Shutdown()
+```
+
+## Usage
+
+```go
+package main
+
+import (
+	"fmt"
+	"log"

-### [Examples & API](https://github.com/twiny/wbot/wiki)
+	"github.com/rs/zerolog"
+	"github.com/twiny/wbot"
+	"github.com/twiny/wbot/crawler"
+)

-### TODO
-- [ ] Add support for robots.txt.
-- [ ] Add test cases.
-- [ ] Implement `Fetch` using Chromedp.
-- [ ] Add more examples.
-- [ ] Add documentation.
+func main() {
+	bot := crawler.New(
+		crawler.WithParallel(50),
+		crawler.WithMaxDepth(5),
+		crawler.WithRateLimit(&wbot.RateLimit{
+			Hostname: "*",
+			Rate:     "10/1s",
+		}),
+		crawler.WithLogLevel(zerolog.DebugLevel),
+	)
+	defer bot.Shutdown()
+
+	// read responses
+	bot.OnReponse(func(resp *wbot.Response) {
+		fmt.Printf("crawled: %s\n", resp.URL.String())
+	})
+
+	if err := bot.Run(
+		"https://crawler-test.com/",
+	); err != nil {
+		log.Fatal(err)
+	}
+
+	log.Printf("finished crawling\n")
+}
+```

 ### Bugs
+
 Bugs or suggestions? Please visit the [issue tracker](https://github.com/twiny/wbot/issues).
````
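The `Rate: "10/1s"` field in the README's usage example reads as ten requests per one-second window. The diff does not show WBot's parser for this string, but assuming the format is `<requests>/<Go duration>`, a minimal sketch of parsing it could look like this (`parseRate` is a hypothetical helper, not part of the shown code):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

// parseRate splits a "<requests>/<window>" string, e.g. "10/1s",
// into a request count and a time window.
func parseRate(s string) (int, time.Duration, error) {
	parts := strings.SplitN(s, "/", 2)
	if len(parts) != 2 {
		return 0, 0, fmt.Errorf("invalid rate %q, want e.g. 10/1s", s)
	}
	n, err := strconv.Atoi(parts[0])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid request count in %q: %w", s, err)
	}
	d, err := time.ParseDuration(parts[1])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid window in %q: %w", s, err)
	}
	return n, d, nil
}

func main() {
	n, d, err := parseRate("10/1s")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d requests per %s\n", n, d) // 10 requests per 1s
}
```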

config.go (−10): this file was deleted.

crawler/config.go (+66, new file)

```go
package crawler

import (
	"runtime"
	"time"

	"github.com/twiny/poxa"
)

const (
	defaultReferrer    = "https://www.google.com/search"
	defaultUserAgent   = "WBot/v0.2.0 (+https://github.com/twiny/wbot)"
	defaultTimeout     = 10 * time.Second
	defaultMaxBodySize = int64(1024 * 1024 * 5) // 5MB
)

type (
	config struct {
		parallel    int
		maxDepth    int32
		maxBodySize int64
		timeout     time.Duration
		userAgents  poxa.Spinner[string]
		referrers   poxa.Spinner[string]
		proxies     poxa.Spinner[string]
	}
)

func newConfig(maxDepth int32, userAgents, referrers, proxies []string) *config {
	if maxDepth <= 0 {
		maxDepth = 10
	}

	var conf = &config{
		parallel:    runtime.NumCPU(),
		maxDepth:    maxDepth,
		maxBodySize: defaultMaxBodySize,
		timeout:     defaultTimeout,
		userAgents:  poxa.NewSpinner(defaultUserAgent),
		referrers:   poxa.NewSpinner(defaultReferrer),
		proxies:     nil,
	}

	if len(userAgents) > 0 {
		uaList := poxa.NewSpinner(userAgents...)
		if uaList != nil {
			conf.userAgents = uaList
		}
	}

	if len(referrers) > 0 {
		refList := poxa.NewSpinner(referrers...)
		if refList != nil {
			conf.referrers = refList
		}
	}

	if len(proxies) > 0 {
		proxyList := poxa.NewSpinner(proxies...)
		if proxyList != nil {
			conf.proxies = proxyList
		}
	}

	return conf
}
```
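This config rotates user agents, referrers, and proxies through `poxa.Spinner`. The diff shows only the `poxa.NewSpinner` constructor, so as an illustration of the round-robin rotation pattern the config appears to rely on, here is a hypothetical stand-in (not poxa's actual API):

```go
package main

import (
	"fmt"
	"sync"
)

// roundRobin is a hypothetical stand-in for the rotation behavior:
// each call to Next returns the following item, cycling forever.
type roundRobin[T any] struct {
	mu    sync.Mutex
	items []T
	idx   int
}

func newRoundRobin[T any](items ...T) *roundRobin[T] {
	if len(items) == 0 {
		return nil // mirrors the nil checks in newConfig above
	}
	return &roundRobin[T]{items: items}
}

func (r *roundRobin[T]) Next() T {
	r.mu.Lock()
	defer r.mu.Unlock()
	item := r.items[r.idx]
	r.idx = (r.idx + 1) % len(r.items)
	return item
}

func main() {
	uas := newRoundRobin("WBot/v0.2.0", "Mozilla/5.0")
	for i := 0; i < 3; i++ {
		fmt.Println(uas.Next()) // WBot/v0.2.0, Mozilla/5.0, WBot/v0.2.0
	}
}
```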
