diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 1355864..675deaf 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -64,6 +64,7 @@ jobs:
           TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }}
           TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
           TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
+          TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
 
       - run: cd terraform && terraform plan
         env:
@@ -74,6 +75,7 @@ jobs:
           TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }}
           TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
           TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
+          TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
 
       - run: cd terraform && terraform apply -input=false -auto-approve
         env:
@@ -83,4 +85,5 @@ jobs:
           TF_VAR_SCRAPER_WEBHOOK: ${{ secrets.SCRAPER_WEBHOOK }}
           TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }}
           TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
-          TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
\ No newline at end of file
+          TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
+          TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
\ No newline at end of file
diff --git a/scraper/main.go b/scraper/main.go
index 2bd4b57..11d9bd1 100644
--- a/scraper/main.go
+++ b/scraper/main.go
@@ -9,6 +9,7 @@ import (
 	"scraper/sitea"
 	"scraper/siteb"
 	"scraper/sitec"
+	"scraper/sited"
 
 	"github.com/aws/aws-lambda-go/lambda"
 )
@@ -24,6 +25,7 @@ var (
 	scraperSiteABaseURL string
 	scraperSiteBBaseURL string
 	scraperSiteCBaseURL string
+	scraperSiteDBaseURL string
 )
 
 func init() {
@@ -49,7 +51,12 @@ func init() {
 
 	scraperSiteCBaseURL = os.Getenv("SCRAPER_SITEC_BASEURL")
 	if scraperSiteCBaseURL == "" {
-		log.Fatal("Environment variable SCRAPER_SITEB_BASEURL must be set")
+		log.Fatal("Environment variable SCRAPER_SITEC_BASEURL must be set")
+	}
+
+	scraperSiteDBaseURL = os.Getenv("SCRAPER_SITED_BASEURL")
+	if scraperSiteDBaseURL == "" {
+		log.Fatal("Environment variable SCRAPER_SITED_BASEURL must be set")
 	}
 }
@@ -59,6 +66,7 @@ func lookForNewJobs() {
 		{ScanNewJobs: sitea.ScanNewJobs, BaseURL: scraperSiteABaseURL},
 		{ScanNewJobs: siteb.ScanNewJobs, BaseURL: scraperSiteBBaseURL},
 		{ScanNewJobs: sitec.ScanNewJobs, BaseURL: scraperSiteCBaseURL},
+		{ScanNewJobs: sited.ScanNewJobs, BaseURL: scraperSiteDBaseURL},
 		// Add more sites here
 	}
diff --git a/scraper/sitea/sitea.go b/scraper/sitea/sitea.go
index 0a5e327..f5b244e 100644
--- a/scraper/sitea/sitea.go
+++ b/scraper/sitea/sitea.go
@@ -2,7 +2,6 @@ package sitea
 
 import (
 	"errors"
-	"fmt"
 	"log"
 	"net/http"
 	"scraper/interest"
@@ -112,8 +111,8 @@ func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) {
 
 func ScanNewJobs(siteABaseUrl string, proxyUrl string) []job.Job {
 	possibleJobs := scanSiteA(siteABaseUrl)
-	fmt.Println("siteA total jobs found", len(possibleJobs))
+	log.Println("siteA total jobs found", len(possibleJobs))
 	interestingJobs := interest.FilterInterest(proxyUrl, possibleJobs, GetSiteAJobInfo)
-	fmt.Println("siteA interesting jobs found", len(interestingJobs))
+	log.Println("siteA interesting jobs found", len(interestingJobs))
 	return interestingJobs
 }
diff --git a/scraper/siteb/siteb.go b/scraper/siteb/siteb.go
index 378f69f..6bc23c0 100644
--- a/scraper/siteb/siteb.go
+++ b/scraper/siteb/siteb.go
@@ -2,7 +2,6 @@ package siteb
 
 import (
 	"errors"
-	"fmt"
 	"log"
 	"net/http"
 	"scraper/interest"
@@ -94,8 +93,8 @@ func getSiteBJobInfo(jobUrl string, proxyUrl string) (string, error) {
 
 func ScanNewJobs(sitebBaseUrl string, proxyUrl string) []job.Job {
 	jobs := scanSiteB(sitebBaseUrl)
-	fmt.Println("siteB total jobs found", len(jobs))
+	log.Println("siteB total jobs found", len(jobs))
 	interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteBJobInfo)
-	fmt.Println("siteB interesting jobs", len(interestingJobs))
+	log.Println("siteB interesting jobs", len(interestingJobs))
 	return interestingJobs
 }
diff --git a/scraper/sitec/sitec.go b/scraper/sitec/sitec.go
index 710e4b0..e742d9b 100644
--- a/scraper/sitec/sitec.go
+++ b/scraper/sitec/sitec.go
@@ -2,7 +2,6 @@ package sitec
 
 import (
 	"errors"
-	"fmt"
 	"log"
 	"net/http"
 
@@ -98,8 +97,8 @@ func ScanNewJobs(sitecBaseUrl string, proxyUrl string) []job.Job {
 	}
 	jobs = job.DeduplicatedLinks(jobs)
 
-	fmt.Println("siteC total jobs found", len(jobs))
+	log.Println("siteC total jobs found", len(jobs))
 	interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteCJobInfo)
-	fmt.Println("siteC interesting jobs", len(interestingJobs))
+	log.Println("siteC interesting jobs", len(interestingJobs))
 	return interestingJobs
 }
diff --git a/scraper/sited/sited.go b/scraper/sited/sited.go
new file mode 100644
index 0000000..4207abf
--- /dev/null
+++ b/scraper/sited/sited.go
@@ -0,0 +1,86 @@
+package sited
+
+import (
+	"errors"
+	"log"
+	"net/http"
+	"scraper/interest"
+	"scraper/job"
+	"strings"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+func scanSiteD(siteDBaseUrl string) []job.Job {
+	var newJobs = []job.Job{}
+	url := siteDBaseUrl + "/remote-jobs/developer/"
+	response, err := http.Get(url)
+	if err != nil {
+		log.Println("SiteD: Failed to get site", err)
+		return newJobs
+	}
+	defer response.Body.Close()
+
+	if response.StatusCode != http.StatusOK {
+		log.Printf("SiteD: HTTP request failed with status: %s", response.Status)
+		return newJobs
+	}
+
+	// Parse the HTML document using goquery
+	doc, err := goquery.NewDocumentFromReader(response.Body)
+	if err != nil {
+		log.Println("SiteD: Failed to parse site", err)
+		return newJobs
+	}
+
+	doc.Find("a.card.m-0.border-left-0.border-right-0.border-top-0.border-bottom").Each(func(i int, s *goquery.Selection) {
+		jobURL, exists := s.Attr("href")
+		if exists {
+			jobTitle := s.Find(".font-weight-bold.larger").Text()
+			postTime := strings.TrimSpace(s.Find(".float-right.d-none.d-md-inline.text-secondary small").Text())
+			companyInfo := strings.TrimSpace(s.Find("p.m-0.text-secondary").First().Text())
+			company := strings.TrimSpace(strings.Split(companyInfo, "|")[0])
+
+			newJob := job.Job{
+				Title:   jobTitle,
+				Link:    siteDBaseUrl + jobURL,
+				Company: company,
+			}
+
+			if strings.Contains(postTime, "hours") {
+				newJobs = append(newJobs, newJob)
+			}
+		}
+	})
+
+	return newJobs
+}
+
+func getSiteDJobInfo(jobUrl string, proxyUrl string) (string, error) {
+	response, err := http.Get(proxyUrl + "/proxy?url=" + jobUrl)
+	if err != nil {
+		return "", err
+	}
+	defer response.Body.Close()
+
+	if response.StatusCode != http.StatusOK {
+		return "", errors.New("HTTP request failed with status: " + response.Status)
+	}
+
+	// Parse the HTML document using goquery
+	doc, err := goquery.NewDocumentFromReader(response.Body)
+	if err != nil {
+		return "", err
+	}
+
+	jobInfo := doc.Find("div.job_description").Text()
+	return jobInfo, nil
+}
+
+func ScanNewJobs(siteDBaseUrl string, proxyUrl string) []job.Job {
+	jobs := scanSiteD(siteDBaseUrl)
+	log.Println("siteD total jobs found", len(jobs))
+	interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteDJobInfo)
+	log.Println("siteD interesting jobs", len(interestingJobs))
+	return interestingJobs
+}
diff --git a/terraform/main.tf b/terraform/main.tf
index 599db41..a6dd4ea 100644
--- a/terraform/main.tf
+++ b/terraform/main.tf
@@ -32,6 +32,7 @@ module "scraper_lambda" {
     "SCRAPER_SITEA_BASEURL" = "${var.SCRAPER_SITEA_BASEURL}"
     "SCRAPER_SITEB_BASEURL" = "${var.SCRAPER_SITEB_BASEURL}"
     "SCRAPER_SITEC_BASEURL" = "${var.SCRAPER_SITEC_BASEURL}"
+    "SCRAPER_SITED_BASEURL" = "${var.SCRAPER_SITED_BASEURL}"
   }
 }
diff --git a/terraform/vars.tf b/terraform/vars.tf
index a51f79e..7022aa7 100644
--- a/terraform/vars.tf
+++ b/terraform/vars.tf
@@ -19,6 +19,10 @@ variable "SCRAPER_SITEC_BASEURL" {
   sensitive = true
 }
 
+variable "SCRAPER_SITED_BASEURL" {
+  sensitive = true
+}
+
 variable "AWS_ACCOUNT_ID" {
   sensitive = true
 }
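
For context on the pattern the main.go hunk extends: each site package exports a ScanNewJobs function with the same shape, and lookForNewJobs ranges over a slice pairing that function with its base URL. A minimal sketch of the registry type this implies — the struct name Scraper is an assumption; only the ScanNewJobs and BaseURL field names are confirmed by the composite literals in the diff:

package main

import "scraper/job"

// Hypothetical sketch, not taken from the repo: the type name "Scraper" is
// assumed. The field names ScanNewJobs and BaseURL come from the composite
// literals in lookForNewJobs, and the function signature matches the one
// every site package (sitea through sited) exports.
type Scraper struct {
	ScanNewJobs func(baseURL string, proxyURL string) []job.Job
	BaseURL     string
}

Adding a hypothetical site E would follow the same trail this diff blazes for site D: a new package exporting that signature, one entry in the slice, and the base URL plumbed through as a GitHub secret, a TF_VAR_SCRAPER_SITEE_BASEURL workflow variable (Terraform reads any TF_VAR_<name> environment variable as the value of variable <name>), a Lambda environment entry in main.tf, and an os.Getenv check in init.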
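
Because scanSiteD's class selectors and the "hours" recency filter can break silently when the site's markup changes, a stub-server test is cheap insurance. A hedged sketch for scraper/sited/sited_test.go — the fixture HTML is invented to match the selectors the scraper queries, not copied from the real site:

package sited

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"testing"
)

func TestScanSiteD(t *testing.T) {
	// Hypothetical fixture shaped only by the selectors in scanSiteD.
	page := `<a class="card m-0 border-left-0 border-right-0 border-top-0 border-bottom" href="/job/1">
		<span class="font-weight-bold larger">Go Developer</span>
		<span class="float-right d-none d-md-inline text-secondary"><small>3 hours ago</small></span>
		<p class="m-0 text-secondary">Acme Corp | Remote</p>
	</a>`

	// Serve the fixture at the path scanSiteD requests.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/remote-jobs/developer/" {
			http.NotFound(w, r)
			return
		}
		fmt.Fprint(w, page)
	}))
	defer srv.Close()

	jobs := scanSiteD(srv.URL)
	if len(jobs) != 1 {
		t.Fatalf("expected 1 job, got %d", len(jobs))
	}
	if jobs[0].Company != "Acme Corp" {
		t.Errorf("company = %q, want %q", jobs[0].Company, "Acme Corp")
	}
	if jobs[0].Link != srv.URL+"/job/1" {
		t.Errorf("link = %q, want %q", jobs[0].Link, srv.URL+"/job/1")
	}
}

This covers only the listing scrape; getSiteDJobInfo would need a second stub standing in for the proxy, since it fetches proxyUrl + "/proxy?url=" + jobUrl rather than the job page directly.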