diff --git a/scraper/main.go b/scraper/main.go index d3069b9..81e7a3f 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -78,8 +78,8 @@ func init() { } dynamoTable = os.Getenv("DYNAMO_TABLE") - if scraperSiteFBaseURL == "" { - log.Fatal("Environment variable SCRAPER_SITEF_BASEURL must be set") + if dynamoTable == "" { + log.Fatal("Environment variable DYNAMO_TABLE must be set") } } @@ -120,6 +120,7 @@ func lookForNewJobs() { for range sites { <-doneChannel } + } func handler(ctx context.Context) error { diff --git a/scraper/sitea/sitea.go b/scraper/sitea/sitea.go index ac291e7..13b4a4c 100644 --- a/scraper/sitea/sitea.go +++ b/scraper/sitea/sitea.go @@ -7,6 +7,7 @@ import ( "scraper/job" "strconv" "strings" + "sync" "github.com/PuerkitoBio/goquery" ) @@ -65,21 +66,39 @@ func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) { } func ScanNewJobs(siteABaseUrl string, proxyUrl string, cache *cache.Cache) ([]job.Job, []job.Job) { + var wg sync.WaitGroup + jobsChan := make(chan []job.Job) + + fetchJobs := func(url string) { + defer wg.Done() + finished := false + page := 1 + for !finished && page <= 15 { + pageStr := strconv.Itoa(page) + url := url + "?page=" + pageStr + jobs := job.GetNewJobs(url, proxyUrl, siteAJobListParser) + jobsChan <- jobs + // No new jobs found, we're done + if len(jobs) == 0 { + finished = true + } + page++ + } + } + + wg.Add(2) + go fetchJobs(siteABaseUrl + "/jobs/remote/nationwide/dev-engineering") + // lat and lon are obfuscated / local hospital + go fetchJobs(siteABaseUrl + "/jobs/hybrid/office/dev-engineering?search=Software+Engineer&location=Englewood-CO-USA&longitude=-104.99350&latitude=39.65464&searcharea=25mi") + + go func() { + wg.Wait() + close(jobsChan) + }() + possibleJobs := []job.Job{} - finished := false - page := 1 - - for !finished || page > 15 { - currentJobCount := len(possibleJobs) - pageStr := strconv.Itoa(page) - url := siteABaseUrl + "/jobs/remote/nationwide/dev-engineering?page=" + pageStr - jobs := 
job.GetNewJobs(url, proxyUrl, siteAJobListParser) + for jobs := range jobsChan { possibleJobs = append(possibleJobs, jobs...) - // No new jobs found were done - if currentJobCount == len(possibleJobs) { - finished = true - } - page++ } log.Println(siteABaseUrl+" total jobs found", len(possibleJobs))