
Commit

Merge pull request #6 from austin1237/sited
sited
austin1237 authored Feb 6, 2024
2 parents fefb4f1 + 42893fc commit 864cd98
Showing 8 changed files with 110 additions and 11 deletions.
.github/workflows/deploy.yml (5 changes: 4 additions & 1 deletion)
@@ -64,6 +64,7 @@ jobs:
TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }}
TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
+TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}

- run: cd terraform && terraform plan
env:
@@ -74,6 +75,7 @@ jobs:
TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }}
TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
+TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}

- run: cd terraform && terraform apply -input=false -auto-approve
env:
@@ -83,4 +85,5 @@
TF_VAR_SCRAPER_WEBHOOK: ${{ secrets.SCRAPER_WEBHOOK }}
TF_VAR_SCRAPER_SITEA_BASEURL: ${{ secrets.SCRAPER_SITEA_BASEURL }}
TF_VAR_SCRAPER_SITEB_BASEURL: ${{ secrets.SCRAPER_SITEB_BASEURL }}
-TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
+TF_VAR_SCRAPER_SITEC_BASEURL: ${{ secrets.SCRAPER_SITEC_BASEURL }}
+TF_VAR_SCRAPER_SITED_BASEURL: ${{ secrets.SCRAPER_SITED_BASEURL }}
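Each workflow step carries its own env block, so the new SITED secret is repeated across all three Terraform steps rather than set once; hoisting the shared TF_VAR_ values to a job-level env block would be one way to avoid the repetition, but this commit follows the file's existing per-step pattern.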
scraper/main.go (10 changes: 9 additions & 1 deletion)
@@ -9,6 +9,7 @@ import (
"scraper/sitea"
"scraper/siteb"
"scraper/sitec"
"scraper/sited"

"github.com/aws/aws-lambda-go/lambda"
)
@@ -24,6 +25,7 @@ var (
scraperSiteABaseURL string
scraperSiteBBaseURL string
scraperSiteCBaseURL string
+scraperSiteDBaseURL string
)

func init() {
@@ -49,7 +51,12 @@ func init() {

scraperSiteCBaseURL = os.Getenv("SCRAPER_SITEC_BASEURL")
if scraperSiteCBaseURL == "" {
log.Fatal("Environment variable SCRAPER_SITEB_BASEURL must be set")
log.Fatal("Environment variable SCRAPER_SITEC_BASEURL must be set")
}

+scraperSiteDBaseURL = os.Getenv("SCRAPER_SITED_BASEURL")
+if scraperSiteDBaseURL == "" {
+	log.Fatal("Environment variable SCRAPER_SITED_BASEURL must be set")
+}

}
@@ -59,6 +66,7 @@ func lookForNewJobs() {
{ScanNewJobs: sitea.ScanNewJobs, BaseURL: scraperSiteABaseURL},
{ScanNewJobs: siteb.ScanNewJobs, BaseURL: scraperSiteBBaseURL},
{ScanNewJobs: sitec.ScanNewJobs, BaseURL: scraperSiteCBaseURL},
+{ScanNewJobs: sited.ScanNewJobs, BaseURL: scraperSiteDBaseURL},
// Add more sites here
}

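For context, lookForNewJobs presumably iterates over a slice of site descriptors, each pairing a package's ScanNewJobs entry point with its base URL, so registering a new scraper is a one-line change. A minimal sketch of that pattern (the site struct name and the runAll helper are illustrative assumptions, not code taken from this repository):

	package main

	import "scraper/job"

	// site pairs a scraper entry point with the base URL it should crawl.
	// Struct and field names here are assumed for illustration.
	type site struct {
		ScanNewJobs func(baseURL string, proxyURL string) []job.Job
		BaseURL     string
	}

	// runAll invokes every registered scraper and merges the results.
	func runAll(sites []site, proxyURL string) []job.Job {
		var all []job.Job
		for _, s := range sites {
			all = append(all, s.ScanNewJobs(s.BaseURL, proxyURL)...)
		}
		return all
	}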
scraper/sitea/sitea.go (5 changes: 2 additions & 3 deletions)
@@ -2,7 +2,6 @@ package sitea

import (
"errors"
"fmt"
"log"
"net/http"
"scraper/interest"
@@ -112,8 +111,8 @@ func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) {

func ScanNewJobs(siteABaseUrl string, proxyUrl string) []job.Job {
possibleJobs := scanSiteA(siteABaseUrl)
fmt.Println("siteA total jobs found", len(possibleJobs))
log.Println("siteA total jobs found", len(possibleJobs))
interestingJobs := interest.FilterInterest(proxyUrl, possibleJobs, GetSiteAJobInfo)
fmt.Println("siteA interesting jobs found", len(interestingJobs))
log.Println("siteA interesting jobs found", len(interestingJobs))
return interestingJobs
}
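Replacing fmt.Println with log.Println here (and in siteb and sitec below) presumably routes these messages through the standard logger, which writes to stderr and prefixes a timestamp, consistent with the log.Fatal calls in main.go. With the default flags the output looks like:

	log.Println("siteA total jobs found", 12)
	// Output (default flags):
	// 2024/02/06 15:04:05 siteA total jobs found 12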
scraper/siteb/siteb.go (5 changes: 2 additions & 3 deletions)
@@ -2,7 +2,6 @@ package siteb

import (
"errors"
"fmt"
"log"
"net/http"
"scraper/interest"
@@ -94,8 +93,8 @@ func getSiteBJobInfo(jobUrl string, proxyUrl string) (string, error) {

func ScanNewJobs(sitebBaseUrl string, proxyUrl string) []job.Job {
jobs := scanSiteB(sitebBaseUrl)
fmt.Println("siteB total jobs found", len(jobs))
log.Println("siteB total jobs found", len(jobs))
interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteBJobInfo)
fmt.Println("siteB interesting jobs", len(interestingJobs))
log.Println("siteB interesting jobs", len(interestingJobs))
return interestingJobs
}
scraper/sitec/sitec.go (5 changes: 2 additions & 3 deletions)
@@ -2,7 +2,6 @@ package sitec

import (
"errors"
"fmt"
"log"
"net/http"

@@ -98,8 +97,8 @@ func ScanNewJobs(sitecBaseUrl string, proxyUrl string) []job.Job {
}

jobs = job.DeduplicatedLinks(jobs)
fmt.Println("siteC total jobs found", len(jobs))
log.Println("siteC total jobs found", len(jobs))
interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteCJobInfo)
fmt.Println("siteC interesting jobs", len(interestingJobs))
log.Println("siteC interesting jobs", len(interestingJobs))
return interestingJobs
}
scraper/sited/sited.go (86 changes: 86 additions & 0 deletions)
@@ -0,0 +1,86 @@
package sited

import (
"errors"
"log"
"net/http"
"scraper/interest"
"scraper/job"
"strings"

"github.com/PuerkitoBio/goquery"
)

func scanSiteD(siteDBaseUrl string) []job.Job {
var newJobs = []job.Job{}
url := siteDBaseUrl + "/remote-jobs/developer/"
response, err := http.Get(url)
if err != nil {
log.Println("SiteD: Failed to get site", err)
return newJobs
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
log.Printf("SiteD: HTTP request failed with status: %s", response.Status)
return newJobs
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Println("SiteD: Failed to parse site", err)
return newJobs
}

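// Each job listing is rendered as an <a> card element; pull the title,
// post time, and company out of its child nodes.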
doc.Find("a.card.m-0.border-left-0.border-right-0.border-top-0.border-bottom").Each(func(i int, s *goquery.Selection) {
jobURL, exists := s.Attr("href")
if exists {
jobTitle := s.Find(".font-weight-bold.larger").Text()
postTime := strings.TrimSpace(s.Find(".float-right.d-none.d-md-inline.text-secondary small").Text())
companyInfo := strings.TrimSpace(s.Find("p.m-0.text-secondary").First().Text())
company := strings.TrimSpace(strings.Split(companyInfo, "|")[0])

newJob := job.Job{
Title: jobTitle,
Link: siteDBaseUrl + jobURL,
Company: company,
}

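// Keep only listings whose post time is reported in hours
// (e.g. "5 hours ago"), i.e. jobs posted within roughly the last day.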
if strings.Contains(postTime, "hours") {
newJobs = append(newJobs, newJob)
}
}
})

return newJobs
}

func getSiteDJobInfo(jobUrl string, proxyUrl string) (string, error) {
response, err := http.Get(proxyUrl + "/proxy?url=" + jobUrl)
if err != nil {
return "", err
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
return "", errors.New("HTTP request failed with status: " + response.Status)
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
return "", err
}

jobInfo := doc.Find("div.job_description").Text()
return jobInfo, nil
}

func ScanNewJobs(siteDBaseUrl string, proxyUrl string) []job.Job {
jobs := scanSiteD(siteDBaseUrl)
log.Println("siteD total jobs found", len(jobs))
interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteDJobInfo)
log.Println("siteD interesting jobs", len(interestingJobs))
return interestingJobs
}
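Since scanSiteD takes its base URL as a parameter, it can be pointed at a local stub server for testing. A hypothetical sketch, not part of this commit, where the HTML is a guess at the minimal markup the goquery selectors above expect:

	package sited

	import (
		"net/http"
		"net/http/httptest"
		"testing"
	)

	func TestScanSiteD(t *testing.T) {
		// Minimal HTML matching the selectors used in scanSiteD.
		html := `<a class="card m-0 border-left-0 border-right-0 border-top-0 border-bottom" href="/job/1">
		  <span class="font-weight-bold larger">Go Developer</span>
		  <span class="float-right d-none d-md-inline text-secondary"><small>3 hours ago</small></span>
		  <p class="m-0 text-secondary">Acme Corp | Remote</p>
		</a>`
		// Serve the stub page for every path, including /remote-jobs/developer/.
		srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
			w.Write([]byte(html))
		}))
		defer srv.Close()

		jobs := scanSiteD(srv.URL)
		if len(jobs) != 1 || jobs[0].Company != "Acme Corp" {
			t.Fatalf("unexpected jobs: %+v", jobs)
		}
	}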
terraform/main.tf (1 change: 1 addition & 0 deletions)
@@ -32,6 +32,7 @@ module "scraper_lambda" {
"SCRAPER_SITEA_BASEURL" = "${var.SCRAPER_SITEA_BASEURL}"
"SCRAPER_SITEB_BASEURL" = "${var.SCRAPER_SITEB_BASEURL}"
"SCRAPER_SITEC_BASEURL" = "${var.SCRAPER_SITEC_BASEURL}"
"SCRAPER_SITED_BASEURL" = "${var.SCRAPER_SITED_BASEURL}"
}
}

terraform/vars.tf (4 changes: 4 additions & 0 deletions)
@@ -19,6 +19,10 @@ variable "SCRAPER_SITEC_BASEURL" {
sensitive = true
}

variable "SCRAPER_SITED_BASEURL" {
sensitive = true
}

variable "AWS_ACCOUNT_ID" {
sensitive = true
}
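End to end, the new setting presumably flows through three layers: the deploy workflow exports the repository secret as TF_VAR_SCRAPER_SITED_BASEURL, Terraform binds any TF_VAR_-prefixed environment variable to the input variable of the same name (var.SCRAPER_SITED_BASEURL, declared sensitive above), and main.tf forwards it through the scraper_lambda module's environment map so that scraper/main.go can read SCRAPER_SITED_BASEURL via os.Getenv in init.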
