diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 675deaf..1e640c8 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -65,6 +65,7 @@ jobs: TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }} TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }} TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }} + TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }} - run: cd terraform && terraform plan env: @@ -76,6 +77,7 @@ jobs: TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }} TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }} TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }} + TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }} - run: cd terraform && terraform apply -input=false -auto-approve env: @@ -86,4 +88,5 @@ jobs: TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }} TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }} TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }} - TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }} \ No newline at end of file + TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }} + TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }} \ No newline at end of file diff --git a/scraper/main.go b/scraper/main.go index c252f28..80cab63 100644 --- a/scraper/main.go +++ b/scraper/main.go @@ -11,6 +11,7 @@ import ( "scraper/siteb" "scraper/sitec" "scraper/sited" + "scraper/sitee" "github.com/aws/aws-lambda-go/lambda" ) @@ -27,6 +28,7 @@ var ( scraperSiteBBaseURL string scraperSiteCBaseURL string scraperSiteDBaseURL string + scraperSiteEBaseURL string ) func init() { @@ -60,6 +62,11 @@ func init() { log.Fatal("Environment variable SCRAPER_SITED_BASEURL must be set") } + scraperSiteEBaseURL = os.Getenv("SCRAPER_SITEE_BASEURL") + if scraperSiteEBaseURL == "" { + log.Fatal("Environment variable SCRAPER_SITEE_BASEURL must be set")
+ } + } func lookForNewJobs() { @@ -68,6 +75,7 @@ func lookForNewJobs() { {ScanNewJobs: siteb.ScanNewJobs, BaseURL: scraperSiteBBaseURL}, {ScanNewJobs: sitec.ScanNewJobs, BaseURL: scraperSiteCBaseURL}, {ScanNewJobs: sited.ScanNewJobs, BaseURL: scraperSiteDBaseURL}, + {ScanNewJobs: sitee.ScanNewJobs, BaseURL: scraperSiteEBaseURL}, {ScanNewJobs: remotive.ScanNewJobs, BaseURL: "https://remotive.com"}, // Add more sites here } diff --git a/scraper/sitee/sitee.go b/scraper/sitee/sitee.go new file mode 100644 index 0000000..685f839 --- /dev/null +++ b/scraper/sitee/sitee.go @@ -0,0 +1,55 @@ +package sitee + +import ( + "log" + "scraper/interest" + "scraper/job" + "strings" + + "github.com/PuerkitoBio/goquery" +) + +func siteEJobListParser(baseURL string, doc *goquery.Document) []job.Job { + newJobs := []job.Job{} + doc.Find("li").Each(func(i int, s *goquery.Selection) { + recent := false + link, _ := s.Find("a").First().Attr("href") + company := strings.TrimSpace(s.Find("a").Eq(1).Text()) + s.Find("span").Each(func(i int, span *goquery.Selection) { + text := strings.ToLower(span.Text()) + if strings.Contains(text, "new job") { + recent = true + } + }) + if recent { + newJob := job.Job{ + Company: company, + Link: baseURL + link, + } + newJobs = append(newJobs, newJob) + } + }) + + return newJobs + +} + +func getSiteEJobInfo(jobUrl string, proxyUrl string) (string, error) { + doc, err := job.GetJobHtml(jobUrl, proxyUrl) + if err != nil { + return "", err + } + jobInfo := "" + doc.Find("div.mb-6.prose.break-words.prose-md.max-w-none").Each(func(i int, s *goquery.Selection) { + jobInfo += s.Find("*").Text() + " " + }) + return jobInfo, nil +} + +func ScanNewJobs(baseURL string, proxyURL string) []job.Job { + jobs := job.GetNewJobs(baseURL+"/category/development", proxyURL, siteEJobListParser) + log.Println(baseURL+" total jobs found", len(jobs)) + interestingJobs := interest.FilterInterest(proxyURL, jobs, getSiteEJobInfo) + 
log.Println(baseURL+" total interesting jobs found", len(interestingJobs)) + return interestingJobs +} diff --git a/terraform/main.tf b/terraform/main.tf index a6dd4ea..f1e7f83 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -33,6 +33,7 @@ module "scraper_lambda" { "SCRAPER_SITEB_BASEURL" = "${var.SCRAPER_SITEB_BASEURL}" "SCRAPER_SITEC_BASEURL" = "${var.SCRAPER_SITEC_BASEURL}" "SCRAPER_SITED_BASEURL" = "${var.SCRAPER_SITED_BASEURL}" + "SCRAPER_SITEE_BASEURL" = "${var.SCRAPER_SITEE_BASEURL}" } } @@ -51,7 +52,7 @@ module "headless_lambda" { name = "headless-${terraform.workspace}" memory_size = 2048 timeout = 30 - image_uri = "${var.AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/headless@${var.DOCKER_IMAGE_SHA}" # Replace with your Docker image URI + image_uri = "${var.AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/headless@${var.DOCKER_IMAGE_SHA}" package_type = "Image" env_vars = {} } diff --git a/terraform/vars.tf b/terraform/vars.tf index 7022aa7..3229dba 100644 --- a/terraform/vars.tf +++ b/terraform/vars.tf @@ -23,6 +23,10 @@ variable "SCRAPER_SITED_BASEURL" { sensitive = true } +variable "SCRAPER_SITEE_BASEURL" { + sensitive = true +} + variable "AWS_ACCOUNT_ID" { sensitive = true }