Skip to content

Commit

Permalink
Merge pull request #10 from austin1237/siteE
Browse files Browse the repository at this point in the history
siteE
  • Loading branch information
austin1237 authored Feb 11, 2024
2 parents 28800a9 + 4f39fbc commit c9501d1
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 2 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ jobs:
TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }}

- run: cd terraform && terraform plan
env:
Expand All @@ -76,6 +77,7 @@ jobs:
TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }}

- run: cd terraform && terraform apply -input=false -auto-approve
env:
Expand All @@ -86,4 +88,5 @@ jobs:
TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }}
TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }}
8 changes: 8 additions & 0 deletions scraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"scraper/siteb"
"scraper/sitec"
"scraper/sited"
"scraper/sitee"

"github.com/aws/aws-lambda-go/lambda"
)
Expand All @@ -27,6 +28,7 @@ var (
scraperSiteBBaseURL string
scraperSiteCBaseURL string
scraperSiteDBaseURL string
scraperSiteEBaseURL string
)

func init() {
Expand Down Expand Up @@ -60,6 +62,11 @@ func init() {
log.Fatal("Environment variable SCRAPER_SITED_BASEURL must be set")
}

scraperSiteEBaseURL = os.Getenv("SCRAPER_SITEE_BASEURL")
if scraperSiteEBaseURL == "" {
log.Fatal("Environment variable SCRAPER_SITEE_BASEURL must be set")
}

}

func lookForNewJobs() {
Expand All @@ -68,6 +75,7 @@ func lookForNewJobs() {
{ScanNewJobs: siteb.ScanNewJobs, BaseURL: scraperSiteBBaseURL},
{ScanNewJobs: sitec.ScanNewJobs, BaseURL: scraperSiteCBaseURL},
{ScanNewJobs: sited.ScanNewJobs, BaseURL: scraperSiteDBaseURL},
{ScanNewJobs: sitee.ScanNewJobs, BaseURL: scraperSiteEBaseURL},
{ScanNewJobs: remotive.ScanNewJobs, BaseURL: "https://remotive.com"},
// Add more sites here
}
Expand Down
55 changes: 55 additions & 0 deletions scraper/sitee/sitee.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package sitee

import (
"log"
"scraper/interest"
"scraper/job"
"strings"

"github.com/PuerkitoBio/goquery"
)

// siteEJobListParser extracts recently posted jobs from a Site E listing page.
// Each <li> element is treated as one job entry: the first anchor supplies the
// relative job link, the second anchor supplies the company name, and a <span>
// whose text contains "new job" (case-insensitive) marks the entry as recent.
// Only recent entries are returned; links are made absolute with baseURL.
func siteEJobListParser(baseURL string, doc *goquery.Document) []job.Job {
	newJobs := []job.Job{}
	doc.Find("li").Each(func(_ int, entry *goquery.Selection) {
		href, _ := entry.Find("a").First().Attr("href")
		companyName := strings.TrimSpace(entry.Find("a").Eq(1).Text())

		isRecent := false
		entry.Find("span").Each(func(_ int, badge *goquery.Selection) {
			if strings.Contains(strings.ToLower(badge.Text()), "new job") {
				isRecent = true
			}
		})

		if !isRecent {
			return
		}
		newJobs = append(newJobs, job.Job{
			Company: companyName,
			Link:    baseURL + href,
		})
	})
	return newJobs
}

// getSiteEJobInfo fetches a Site E job posting page and returns its
// description text, gathered from every matching description <div>.
// jobUrl is the absolute URL of the posting; proxyUrl is passed through
// to the HTTP fetcher. On fetch failure it returns "" and the error.
func getSiteEJobInfo(jobUrl string, proxyUrl string) (string, error) {
	doc, err := job.GetJobHtml(jobUrl, proxyUrl)
	if err != nil {
		return "", err
	}
	// Accumulate with strings.Builder instead of += in a loop to avoid
	// quadratic reallocation when a page has many description sections.
	var jobInfo strings.Builder
	doc.Find("div.mb-6.prose.break-words.prose-md.max-w-none").Each(func(i int, s *goquery.Selection) {
		jobInfo.WriteString(s.Find("*").Text())
		jobInfo.WriteString(" ")
	})
	return jobInfo.String(), nil
}

// ScanNewJobs crawls Site E's development category for new postings and
// returns only the jobs that pass the interest filter. Progress counts
// are logged for both the raw and the filtered result sets.
func ScanNewJobs(baseURL string, proxyURL string) []job.Job {
	listingURL := baseURL + "/category/development"
	found := job.GetNewJobs(listingURL, proxyURL, siteEJobListParser)
	log.Println(baseURL+" total jobs found", len(found))

	kept := interest.FilterInterest(proxyURL, found, getSiteEJobInfo)
	log.Println(baseURL+" total interesting jobs found", len(kept))
	return kept
}
3 changes: 2 additions & 1 deletion terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ module "scraper_lambda" {
"SCRAPER_SITEB_BASEURL" = "${var.SCRAPER_SITEB_BASEURL}"
"SCRAPER_SITEC_BASEURL" = "${var.SCRAPER_SITEC_BASEURL}"
"SCRAPER_SITED_BASEURL" = "${var.SCRAPER_SITED_BASEURL}"
"SCRAPER_SITEE_BASEURL" = "${var.SCRAPER_SITEE_BASEURL}"
}
}

Expand All @@ -51,7 +52,7 @@ module "headless_lambda" {
name = "headless-${terraform.workspace}"
memory_size = 2048
timeout = 30
image_uri = "${var.AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/headless@${var.DOCKER_IMAGE_SHA}" # Replace with your Docker image URI
image_uri = "${var.AWS_ACCOUNT_ID}.dkr.ecr.us-east-1.amazonaws.com/headless@${var.DOCKER_IMAGE_SHA}"
package_type = "Image"
env_vars = {}
}
Expand Down
4 changes: 4 additions & 0 deletions terraform/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ variable "SCRAPER_SITED_BASEURL" {
sensitive = true
}

variable "SCRAPER_SITEE_BASEURL" {
sensitive = true
}

variable "AWS_ACCOUNT_ID" {
sensitive = true
}
Expand Down

0 comments on commit c9501d1

Please sign in to comment.