Skip to content

Commit

Permalink
Add FastMode to fetch results much faster (#102)
Browse files Browse the repository at this point in the history
Utilizing scrapemate's stealth mode to try to fetch directly the results
without JS rendering. This version is much faster.

However the version is still in BETA.
Results are not extracted in full and there is the possibility of the
client being blocked
  • Loading branch information
gosom authored Dec 15, 2024
1 parent 1015b4c commit 00bb13a
Show file tree
Hide file tree
Showing 18 changed files with 586 additions and 62 deletions.
30 changes: 23 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,6 @@
---
## Try it



A command-line and web-based Google Maps scraper built using the

[scrapemate](https://github.com/gosom/scrapemate) web crawling framework.
Expand All @@ -53,11 +51,8 @@ customize it to your needs

![Example GIF](img/example.gif)


### Web UI:



```
mkdir -p gmapsdata && docker run -v $PWD/gmapsdata:/gmapsdata -p 8080:8080 gosom/google-maps-scraper -data-folder /gmapsdata
```
Expand Down Expand Up @@ -98,6 +93,7 @@ Your support helps ensure continued improvement and maintenance.
- Optionally extracts emails from the website of the business
- SOCKS5/HTTP/HTTPS proxy support
- Serverless execution via AWS Lambda functions (experimental & no documentation yet)
- Fast Mode (BETA)

## Notes on email extraction

Expand All @@ -113,6 +109,22 @@ For the moment it only checks only one page of the website (the one that is regi
Keep in mind that enabling email extraction results in longer processing time, since more
pages are scraped.

## Fast Mode

Fast mode returns at most 21 search results per query, ordered by distance from the **latitude** and **longitude** provided.
All the results are within the specified **radius**.

It does not return all the data points, only the basic ones.
However, it makes it possible to extract data very quickly.

When you use the fast mode ensure that you have provided:
- zoom
- radius (in meters)
- latitude
- longitude


**Fast mode is Beta, you may experience blocking**

## Extracted Data Points

Expand Down Expand Up @@ -195,7 +207,6 @@ The results are written when they arrive in the `results` file you specified
### Command line options

try `./google-maps-scraper -h` to see the command line options available:

```
-aws-access-key string
AWS access key
Expand Down Expand Up @@ -225,6 +236,8 @@ try `./google-maps-scraper -h` to see the command line options available:
extract emails from websites
-exit-on-inactivity duration
exit after inactivity duration (e.g., '5m')
-fast-mode
fast mode (reduced data collection)
-function-name string
AWS Lambda function name
-geo string
Expand All @@ -239,6 +252,8 @@ try `./google-maps-scraper -h` to see the command line options available:
produce seed jobs only (requires dsn)
-proxies string
comma separated list of proxies to use in the format protocol://user:pass@host:port example: socks5://localhost:9050 or http://user:pass@localhost:9050
-radius float
search radius in meters. Default is 10000 meters (default 10000)
-results string
path to the results file [default: stdout] (default "stdout")
-s3-bucket string
Expand All @@ -248,7 +263,7 @@ try `./google-maps-scraper -h` to see the command line options available:
-writer string
use custom writer plugin (format: 'dir:pluginName')
-zoom int
set zoom level (0-21) for search
set zoom level (0-21) for search (default 15)
```

## Using a custom writer
Expand Down Expand Up @@ -445,3 +460,4 @@ banner is generated using OpenAI's DALE


If you register via the links on my page I may get a commission. This is another way to support my work

71 changes: 71 additions & 0 deletions gmaps/entry.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@ package gmaps
import (
"encoding/json"
"fmt"
"iter"
"math"
"runtime/debug"
"slices"
"strconv"
"strings"
)
Expand Down Expand Up @@ -91,6 +94,33 @@ type Entry struct {
Emails []string `json:"emails"`
}

// haversineDistance returns the great-circle distance, in meters, between the
// entry's Latitude/Longtitude and the point (lat, lon).
//
// All coordinates are treated as decimal degrees and converted to radians
// before the haversine formula is evaluated on a sphere of radius 6371e3 m.
func (e *Entry) haversineDistance(lat, lon float64) float64 {
	const earthRadius = 6371e3 // meters

	toRadians := func(deg float64) float64 {
		return deg * math.Pi / 180
	}

	// phi is latitude and lambda is longitude; index 1 is the reference
	// point, index 2 is the entry.
	phi1, lambda1 := toRadians(lat), toRadians(lon)
	phi2, lambda2 := toRadians(e.Latitude), toRadians(e.Longtitude)

	sinHalfDLat := math.Sin((phi2 - phi1) / 2)
	sinHalfDLon := math.Sin((lambda2 - lambda1) / 2)

	// h is the haversine of the central angle between the two points.
	h := sinHalfDLat*sinHalfDLat +
		math.Cos(phi1)*math.Cos(phi2)*sinHalfDLon*sinHalfDLon

	return earthRadius * (2 * math.Atan2(math.Sqrt(h), math.Sqrt(1-h)))
}

// isWithinRadius reports whether the entry lies at most radius away from the
// point (lat, lon). The distance comes from haversineDistance, so radius is
// compared against a value in meters; a distance exactly equal to radius
// counts as within.
func (e *Entry) isWithinRadius(lat, lon, radius float64) bool {
	return e.haversineDistance(lat, lon) <= radius
}

func (e *Entry) IsWebsiteValidForEmail() bool {
if e.WebSite == "" {
return false
Expand Down Expand Up @@ -555,3 +585,44 @@ func decodeURL(url string) (string, error) {

return unquoted, nil
}

// EntryWithDistance pairs an Entry with a distance computed for it.
// filterAndSortEntriesWithinRadius uses it as the intermediate record it
// sorts by Distance.
type EntryWithDistance struct {
	// Entry is the entry the distance was computed for.
	Entry *Entry
	// Distance is the value returned by Entry.haversineDistance (meters)
	// for the reference point passed to filterAndSortEntriesWithinRadius.
	Distance float64
}

// filterAndSortEntriesWithinRadius returns the entries whose haversine
// distance from (lat, lon) is at most radius, ordered by ascending distance.
//
// The input slice is not modified. When no entry qualifies the result is nil.
func filterAndSortEntriesWithinRadius(entries []*Entry, lat, lon, radius float64) []*Entry {
	// Pair every qualifying entry with its distance so the distance is
	// computed once per entry and can be reused as the sort key.
	var inRange iter.Seq[EntryWithDistance] = func(yield func(EntryWithDistance) bool) {
		for i := range entries {
			d := entries[i].haversineDistance(lat, lon)
			if d <= radius && !yield(EntryWithDistance{Entry: entries[i], Distance: d}) {
				return
			}
		}
	}

	candidates := slices.Collect(inRange)

	slices.SortFunc(candidates, func(x, y EntryWithDistance) int {
		if x.Distance < y.Distance {
			return -1
		}

		if x.Distance > y.Distance {
			return 1
		}

		return 0
	})

	// Strip the distances again; appending to a nil slice keeps the result
	// nil when there are no candidates.
	var ordered []*Entry
	for i := range candidates {
		ordered = append(ordered, candidates[i].Entry)
	}

	return ordered
}
16 changes: 16 additions & 0 deletions gmaps/entry_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package gmaps_test

import (
"fmt"
"os"
"testing"

Expand Down Expand Up @@ -200,3 +201,18 @@ func Test_EntryFromJSONRaw2(t *testing.T) {
require.NoError(t, err)
require.Greater(t, len(entry.About), 0)
}

// Test_EntryFromJsonC reads the ../testdata/output.json fixture, parses it
// with gmaps.ParseSearchResults and prints every parsed entry. It fails when
// the fixture cannot be read, is empty, or cannot be parsed.
func Test_EntryFromJsonC(t *testing.T) {
	const fixture = "../testdata/output.json"

	payload, readErr := os.ReadFile(fixture)
	require.NoError(t, readErr)
	require.NotEmpty(t, payload)

	parsed, parseErr := gmaps.ParseSearchResults(payload)
	require.NoError(t, parseErr)

	for i := range parsed {
		fmt.Printf("%+v\n", parsed[i])
	}
}
90 changes: 90 additions & 0 deletions gmaps/multiple.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package gmaps

import (
"encoding/json"
"fmt"
"strings"

olc "github.com/google/open-location-code/go"
)

// ParseSearchResults decodes a raw JSON search response into entries.
//
// The payload must be a JSON array whose first element is a non-empty array;
// the element at index 1 of that inner array is taken as the result list.
// Index 0 of the result list is skipped, and every later item that is itself
// an array yields one Entry built from the data at index 14 of the item.
// Items that are not arrays are ignored.
//
// An error is returned when the JSON cannot be decoded, when the outer array
// is empty, when its first element is not a non-empty array, or when the
// result list holds fewer than two items.
func ParseSearchResults(raw []byte) ([]*Entry, error) {
	var payload []any
	if err := json.Unmarshal(raw, &payload); err != nil {
		return nil, fmt.Errorf("failed to unmarshal JSON: %w", err)
	}

	if len(payload) == 0 {
		return nil, fmt.Errorf("empty JSON data")
	}

	root, isList := payload[0].([]any)
	if !isList || len(root) == 0 {
		return nil, fmt.Errorf("invalid business list structure")
	}

	listing := getNthElementAndCast[[]any](root, 1)
	if len(listing) < 2 {
		return nil, fmt.Errorf("empty business list")
	}

	results := make([]*Entry, 0, len(listing)-1)

	// Only the items after index 0 are parsed.
	for _, item := range listing[1:] {
		fields, isArr := item.([]any)
		if !isArr {
			continue
		}

		biz := getNthElementAndCast[[]any](fields, 14)

		entry := &Entry{
			ID:           getNthElementAndCast[string](biz, 0),
			Title:        getNthElementAndCast[string](biz, 11),
			Categories:   toStringSlice(getNthElementAndCast[[]any](biz, 13)),
			WebSite:      getNthElementAndCast[string](biz, 7, 0),
			ReviewRating: getNthElementAndCast[float64](biz, 4, 7),
			ReviewCount:  int(getNthElementAndCast[float64](biz, 4, 8)),
			// The address parts are rendered with "%v" and joined by ", ".
			Address:    strings.Join(toStringSlice(getNthElementAndCast[[]any](biz, 2)), ", "),
			Latitude:   getNthElementAndCast[float64](biz, 9, 2),
			Longtitude: getNthElementAndCast[float64](biz, 9, 3),
			// Spaces are stripped from the phone number.
			Phone:     strings.ReplaceAll(getNthElementAndCast[string](biz, 178, 0, 0), " ", ""),
			OpenHours: getHours(biz),
			Status:    getNthElementAndCast[string](biz, 34, 4, 4),
			Timezone:  getNthElementAndCast[string](biz, 30),
			DataID:    getNthElementAndCast[string](biz, 10),
		}

		// The plus code is derived from the coordinates parsed above.
		entry.PlusCode = olc.Encode(entry.Latitude, entry.Longtitude, 10)

		results = append(results, entry)
	}

	return results, nil
}

// toStringSlice converts every element of arr to its fmt "%v" representation.
// The result has exactly len(arr) elements and is never nil, even for a nil
// or empty input.
func toStringSlice(arr []any) []string {
	out := make([]string, len(arr))
	for i := range arr {
		out[i] = fmt.Sprintf("%v", arr[i])
	}

	return out
}
Loading

0 comments on commit 00bb13a

Please sign in to comment.