Skip to content

Commit

Permalink
Merge pull request #12 from austin1237/sitef
Browse files Browse the repository at this point in the history
siteF
  • Loading branch information
austin1237 authored Feb 22, 2024
2 parents 0e64cd6 + 7223c20 commit dabdce3
Show file tree
Hide file tree
Showing 8 changed files with 84 additions and 5 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ jobs:
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }}
TF_VAR_SCRAPER_SITEF_BASEURL: ${{ secrets.SCRAPER_SITEF_BASEURL }}

- run: cd terraform && terraform plan
env:
Expand All @@ -78,6 +79,7 @@ jobs:
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }}
TF_VAR_SCRAPER_SITEF_BASEURL: ${{ secrets.SCRAPER_SITEF_BASEURL }}

- run: cd terraform && terraform apply -input=false -auto-approve
env:
Expand All @@ -89,4 +91,5 @@ jobs:
TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
TF_VAR_SCRAPER_SITEE_BASEURL: ${{ secrets.SCRAPER_SITEE_BASEURL }}
TF_VAR_SCRAPER_SITEF_BASEURL: ${{ secrets.SCRAPER_SITEF_BASEURL }}
1 change: 1 addition & 0 deletions scraper/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ require (
github.com/andybalholm/cascadia v1.3.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/stretchr/objx v0.1.0 // indirect
golang.org/x/net v0.7.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
)
1 change: 1 addition & 0 deletions scraper/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.2 h1:4jaiDzPyXQvSd7D0EjG45355tLlV3VOECpq10pLC+8s=
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
Expand Down
20 changes: 16 additions & 4 deletions scraper/job/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,14 @@ func DeduplicatedLinks(jobs []Job) []Job {
return deduplicated
}

func GetJobHtml(siteUrl string, proxyURL string) (*goquery.Document, error) {
response, err := http.Get(proxyURL + "/proxy?url=" + siteUrl)
func GetJobHtml(siteUrl string, proxyURL string, optionalRoute ...string) (*goquery.Document, error) {
var route string
if len(optionalRoute) > 0 {
route = optionalRoute[0]
} else {
route = "proxy" // default mode
}
response, err := http.Get(proxyURL + "/" + route + "?url=" + siteUrl)
if err != nil {
log.Println(siteUrl+": Failed to get site", err)
return nil, err
Expand All @@ -55,14 +61,20 @@ func GetJobHtml(siteUrl string, proxyURL string) (*goquery.Document, error) {

type parser func(string, *goquery.Document) []Job

func GetNewJobs(siteUrl string, proxyURL string, jobParser parser) []Job {
func GetNewJobs(siteUrl string, proxyURL string, jobParser parser, optionalMode ...string) []Job {
var mode string
if len(optionalMode) > 0 {
mode = optionalMode[0]
} else {
mode = "proxy" // default mode
}
u, err := url.Parse(siteUrl)
baseURL := u.Scheme + "://" + u.Host
if err != nil {
log.Println("Failed to parse url", err)
return []Job{}
}
doc, err := GetJobHtml(siteUrl, proxyURL)
doc, err := GetJobHtml(siteUrl, proxyURL, mode)
if err != nil {
return []Job{}
}
Expand Down
8 changes: 8 additions & 0 deletions scraper/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"scraper/sitec"
"scraper/sited"
"scraper/sitee"
"scraper/sitef"

"github.com/aws/aws-lambda-go/lambda"
)
Expand All @@ -29,6 +30,7 @@ var (
scraperSiteCBaseURL string
scraperSiteDBaseURL string
scraperSiteEBaseURL string
scraperSiteFBaseURL string
)

func init() {
Expand Down Expand Up @@ -67,6 +69,11 @@ func init() {
log.Fatal("Environment variable SCRAPER_SITEE_BASEURL must be set")
}

scraperSiteFBaseURL = os.Getenv("SCRAPER_SITEF_BASEURL")
if scraperSiteFBaseURL == "" {
log.Fatal("Environment variable SCRAPER_SITEF_BASEURL must be set")
}

}

func lookForNewJobs() {
Expand All @@ -76,6 +83,7 @@ func lookForNewJobs() {
{ScanNewJobs: sitec.ScanNewJobs, BaseURL: scraperSiteCBaseURL},
{ScanNewJobs: sited.ScanNewJobs, BaseURL: scraperSiteDBaseURL},
{ScanNewJobs: sitee.ScanNewJobs, BaseURL: scraperSiteEBaseURL},
{ScanNewJobs: sitef.ScanNewJobs, BaseURL: scraperSiteFBaseURL},
{ScanNewJobs: remotive.ScanNewJobs, BaseURL: "https://remotive.com"},
// Add more sites here
}
Expand Down
49 changes: 49 additions & 0 deletions scraper/sitef/sitef.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package sitef

import (
"log"
"scraper/interest"
"scraper/job"
"strings"

"github.com/PuerkitoBio/goquery"
)

// siteFJobListParser extracts job postings from a site F listing page.
// Only postings whose age is reported in hours or minutes are kept, so
// each scan surfaces fresh listings only. Links are made absolute by
// prefixing baseURL onto the relative href.
func siteFJobListParser(baseURL string, doc *goquery.Document) []job.Job {
	found := []job.Job{}
	doc.Find("div.job-wrapper").Each(func(_ int, sel *goquery.Selection) {
		posted := strings.TrimSpace(sel.Find("div.date").Last().Text())
		// Anything older than an hour is reported in days/weeks; skip it.
		recent := strings.Contains(posted, "hour") || strings.Contains(posted, "minute")
		if !recent {
			return
		}
		href, _ := sel.Find("h4 a").Attr("href")
		found = append(found, job.Job{
			Company: strings.TrimSpace(sel.Find("div.company a").First().Text()),
			Link:    baseURL + href,
		})
	})

	return found
}

// getSiteFJobInfo fetches a single site F job posting through the proxy
// and returns the concatenated text of its "div.job" description blocks,
// separated (and trailed) by a single space. It returns an error when
// the page cannot be retrieved.
func getSiteFJobInfo(jobUrl string, proxyUrl string) (string, error) {
	doc, err := job.GetJobHtml(jobUrl, proxyUrl)
	if err != nil {
		return "", err
	}
	// Accumulate with strings.Builder: += on a string inside the loop
	// would reallocate and copy the whole text on every element.
	var text strings.Builder
	doc.Find("div.job").Each(func(i int, s *goquery.Selection) {
		text.WriteString(s.Text())
		text.WriteString(" ")
	})
	return text.String(), nil
}

// ScanNewJobs scrapes site F's development job listings (North America,
// full-time) via the proxy's "headless" route, then filters the results
// down to postings whose descriptions match the interest criteria.
func ScanNewJobs(baseURL string, proxyURL string) []job.Job {
	const listingPath = "/jobs?category=development&location=north-america&positionType=full-time"

	allJobs := job.GetNewJobs(baseURL+listingPath, proxyURL, siteFJobListParser, "headless")
	log.Println(baseURL+" total jobs found", len(allJobs))

	interesting := interest.FilterInterest(proxyURL, allJobs, getSiteFJobInfo)
	log.Println(baseURL+" total interesting jobs found", len(interesting))

	return interesting
}
1 change: 1 addition & 0 deletions terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ module "scraper_lambda" {
"SCRAPER_SITEC_BASEURL" = "${var.SCRAPER_SITEC_BASEURL}"
"SCRAPER_SITED_BASEURL" = "${var.SCRAPER_SITED_BASEURL}"
"SCRAPER_SITEE_BASEURL" = "${var.SCRAPER_SITEE_BASEURL}"
"SCRAPER_SITEF_BASEURL" = "${var.SCRAPER_SITEF_BASEURL}"
}
}

Expand Down
4 changes: 4 additions & 0 deletions terraform/vars.tf
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ variable "SCRAPER_SITEE_BASEURL" {
sensitive = true
}

variable "SCRAPER_SITEF_BASEURL" {
sensitive = true
}

variable "AWS_ACCOUNT_ID" {
sensitive = true
}
Expand Down

0 comments on commit dabdce3

Please sign in to comment.