diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 1e640c8..1c65fcf 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -66,6 +66,7 @@ jobs: TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }} TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }} TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }} + TF_VAR_SCRAPER_SITEF_BASEURL: ${{ secrets.SCRAPER_SITEF_BASEURL }} - run: cd terraform && terraform plan env: @@ -78,6 +79,7 @@ jobs: TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }} TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }} TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }} + TF_VAR_SCRAPER_SITEF_BASEURL: ${{ secrets.SCRAPER_SITEF_BASEURL }} - run: cd terraform && terraform apply -input=false -auto-approve env: @@ -89,4 +91,5 @@ jobs: TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }} TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }} TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }} - TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }} \ No newline at end of file + TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }} + TF_VAR_SCRAPER_SITEF_BASEURL: ${{ secrets.SCRAPER_SITEF_BASEURL }} \ No newline at end of file diff --git a/scraper/go.mod b/scraper/go.mod index 3c0f0cd..38afd82 100644 --- a/scraper/go.mod +++ b/scraper/go.mod @@ -12,6 +12,7 @@ require ( github.com/andybalholm/cascadia v1.3.1 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/stretchr/objx v0.1.0 // indirect golang.org/x/net v0.7.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/scraper/go.sum b/scraper/go.sum index bc59652..d08582d 100644 --- a/scraper/go.sum +++ b/scraper/go.sum @@ -9,6 +9,7 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s= github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= diff --git a/scraper/job/job.go b/scraper/job/job.go index 4c2405b..67e6bfd 100644 --- a/scraper/job/job.go +++ b/scraper/job/job.go @@ -28,8 +28,14 @@ func DeduplicatedLinks(jobs []Job) []Job { return deduplicated } -func GetJobHtml(siteUrl string, proxyURL string) (*goquery.Document, error) { - response, err := http.Get(proxyURL + "/proxy?url=" + siteUrl) +func GetJobHtml(siteUrl string, proxyURL string, optionalRoute ...string) (*goquery.Document, error) { + var route string + if len(optionalRoute) > 0 { + route = optionalRoute[0] + } else { + route = "proxy" // default mode + } + response, err := http.Get(proxyURL + "/" + route + "?url=" + siteUrl) if err != nil { log.Println(siteUrl+": Failed to get site", err) return nil, err @@ -55,14 +61,20 @@ func GetJobHtml(siteUrl string, proxyURL string) (*goquery.Document, error) { type parser func(string, *goquery.Document) []Job -func GetNewJobs(siteUrl string, proxyURL string, jobParser parser) []Job { +func GetNewJobs(siteUrl string, proxyURL string, jobParser parser, optionalMode ...string) []Job { + var mode string + if len(optionalMode) > 0 { + mode = optionalMode[0] + } else { + mode = "proxy" // default mode + } u, err := url.Parse(siteUrl) baseURL := u.Scheme + "://" + u.Host if err != nil { log.Println("Failed to parse url", err) return []Job{} } - doc, err := GetJobHtml(siteUrl, proxyURL) + doc, err := GetJobHtml(siteUrl, proxyURL, mode) if err != nil { return []Job{} } diff --git a/scraper/main.go b/scraper/main.go index 80cab63..c25c31e 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -12,6 +12,7 @@ import ( "scraper/sitec" "scraper/sited" "scraper/sitee" + "scraper/sitef" "github.com/aws/aws-lambda-go/lambda" ) @@ -29,6 +30,7 @@ var ( scraperSiteCBaseURL string scraperSiteDBaseURL string scraperSiteEBaseURL string + scraperSiteFBaseURL string ) func init() { @@ -67,6 +69,11 @@ func init() { log.Fatal("Environment variable SCRAPER_SITEE_BASEURL must be set") } + scraperSiteFBaseURL = os.Getenv("SCRAPER_SITEF_BASEURL") + if scraperSiteFBaseURL == "" { + log.Fatal("Environment variable SCRAPER_SITEF_BASEURL must be set") + } + } func lookForNewJobs() { @@ -76,6 +83,7 @@ func lookForNewJobs() { {ScanNewJobs: sitec.ScanNewJobs, BaseURL: scraperSiteCBaseURL}, {ScanNewJobs: sited.ScanNewJobs, BaseURL: scraperSiteDBaseURL}, {ScanNewJobs: sitee.ScanNewJobs, BaseURL: scraperSiteEBaseURL}, + {ScanNewJobs: sitef.ScanNewJobs, BaseURL: scraperSiteFBaseURL}, {ScanNewJobs: remotive.ScanNewJobs, BaseURL: "https://remotive.com"}, // Add more sites here } diff --git a/scraper/sitef/sitef.go b/scraper/sitef/sitef.go new file mode 100644 index 0000000..9800d75 --- /dev/null +++ b/scraper/sitef/sitef.go @@ -0,0 +1,49 @@ +package sitef + +import ( + "log" + "scraper/interest" + "scraper/job" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +func siteFJobListParser(baseURL string, doc *goquery.Document) []job.Job { + newJobs := []job.Job{} + doc.Find("div.job-wrapper").Each(func(i int, s *goquery.Selection) { + link, _ := s.Find("h4 a").Attr("href") + company := strings.TrimSpace(s.Find("div.company a").First().Text()) + timePosted := strings.TrimSpace(s.Find("div.date").Last().Text()) + if strings.Contains(timePosted, "hour") || strings.Contains(timePosted, "minute") { + newJob := job.Job{ + Company: company, + Link: baseURL + link, + } + newJobs = append(newJobs, newJob) + } + }) + + return newJobs +} + +func getSiteFJobInfo(jobUrl string, proxyUrl string) (string, error) { + doc, err := job.GetJobHtml(jobUrl, proxyUrl) + if err != nil { + return "", err + } + text := "" + doc.Find("div.job").Each(func(i int, s *goquery.Selection) { + text += s.Text() + " " + }) + return text, nil +} + +func ScanNewJobs(baseURL string, proxyURL string) []job.Job { + subUrl := "/jobs?category=development&location=north-america&positionType=full-time" + jobs := job.GetNewJobs(baseURL+subUrl, proxyURL, siteFJobListParser, "headless") + log.Println(baseURL+" total jobs found", len(jobs)) + interestingJobs := interest.FilterInterest(proxyURL, jobs, getSiteFJobInfo) + log.Println(baseURL+" total interesting jobs found", len(interestingJobs)) + return interestingJobs +} diff --git a/terraform/main.tf b/terraform/main.tf index f1e7f83..c9dd29a 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -34,6 +34,7 @@ module "scraper_lambda" { "SCRAPER_SITEC_BASEURL" = "${var.SCRAPER_SITEC_BASEURL}" "SCRAPER_SITED_BASEURL" = "${var.SCRAPER_SITED_BASEURL}" "SCRAPER_SITEE_BASEURL" = "${var.SCRAPER_SITEE_BASEURL}" + "SCRAPER_SITEF_BASEURL" = "${var.SCRAPER_SITEF_BASEURL}" } } diff --git a/terraform/vars.tf b/terraform/vars.tf index 3229dba..8646b53 100644 --- a/terraform/vars.tf +++ b/terraform/vars.tf @@ -27,6 +27,10 @@ variable "SCRAPER_SITEE_BASEURL" { sensitive = true } +variable "SCRAPER_SITEF_BASEURL" { + sensitive = true +} + variable "AWS_ACCOUNT_ID" { sensitive = true }