Merge pull request #7 from austin1237/proxy-refactor
All requests now go through proxy
austin1237 authored Feb 7, 2024
2 parents 864cd98 + dd3c524 commit bdbdc64
Showing 7 changed files with 174 additions and 196 deletions.
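
For context: after this refactor, every fetch in the scraper goes out as GET <proxyURL>/proxy?url=<target>, as the calls below show. The proxy service itself is not part of this diff; the following is a hypothetical sketch of a handler that would satisfy that contract (the port, package, and error handling are assumptions, not code from this repository):

package main

import (
    "io"
    "log"
    "net/http"
)

func main() {
    http.HandleFunc("/proxy", func(w http.ResponseWriter, r *http.Request) {
        // The target site arrives as ?url=..., matching the calls in this diff.
        target := r.URL.Query().Get("url")
        if target == "" {
            http.Error(w, "missing url parameter", http.StatusBadRequest)
            return
        }
        // Forward the request upstream and relay status and body back.
        resp, err := http.Get(target)
        if err != nil {
            http.Error(w, err.Error(), http.StatusBadGateway)
            return
        }
        defer resp.Body.Close()
        w.WriteHeader(resp.StatusCode)
        io.Copy(w, resp.Body)
    })
    log.Fatal(http.ListenAndServe(":8080", nil))
}
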
5 changes: 1 addition & 4 deletions go.work.sum
@@ -1,4 +1 @@
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
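
go.work.sum is maintained by the Go toolchain, so a pruning like the one above normally comes from re-syncing the workspace rather than from hand-editing; presumably a run of:

go work sync
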
50 changes: 50 additions & 0 deletions scraper/job/job.go
@@ -1,5 +1,14 @@
package job

import (
"errors"
"log"
"net/http"
"net/url"

"github.com/PuerkitoBio/goquery"
)

type Job struct {
Title string
Company string
@@ -18,3 +27,44 @@ func DeduplicatedLinks(jobs []Job) []Job {
}
return deduplicated
}

// GetJobHtml requests siteUrl via the proxy's /proxy?url= endpoint and
// parses the response body into a goquery document.
func GetJobHtml(siteUrl string, proxyURL string) (*goquery.Document, error) {
response, err := http.Get(proxyURL + "/proxy?url=" + siteUrl)
if err != nil {
log.Println(siteUrl+": Failed to get site", err)
return nil, err
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
err := errors.New(siteUrl + ": HTTP request failed with status: " + response.Status)
log.Println(err.Error())
return nil, err
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Println(siteUrl+": Failed to parse site", err)
return nil, err
}

return doc, nil
}

// parser converts a fetched listing page into a site's jobs.
type parser func(string, *goquery.Document) []Job

// GetNewJobs fetches siteUrl through the proxy, parses the page, and hands
// the document and the site's base URL to the site-specific jobParser.
func GetNewJobs(siteUrl string, proxyURL string, jobParser parser) []Job {
u, err := url.Parse(siteUrl)
if err != nil {
log.Println("Failed to parse url", err)
return []Job{}
}
baseURL := u.Scheme + "://" + u.Host
doc, err := GetJobHtml(siteUrl, proxyURL)
if err != nil {
return []Job{}
}
return jobParser(baseURL, doc)
}
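
Taken together, GetJobHtml and GetNewJobs are the shared entry points the site scrapers now call. A minimal caller sketch, assuming the scraper module layout visible in the imports above; toyParser, its selector, and the proxy address are illustrative only:

package main

import (
    "fmt"

    "scraper/job"

    "github.com/PuerkitoBio/goquery"
)

// toyParser is a made-up parser for illustration: it treats every
// <a class="job"> element as a job posting with a relative link.
func toyParser(baseURL string, doc *goquery.Document) []job.Job {
    jobs := []job.Job{}
    doc.Find("a.job").Each(func(i int, s *goquery.Selection) {
        href, _ := s.Attr("href")
        jobs = append(jobs, job.Job{Title: s.Text(), Link: baseURL + href})
    })
    return jobs
}

func main() {
    // The proxy address is an assumption; point it at the real proxy service.
    jobs := job.GetNewJobs("https://example.com/jobs", "http://localhost:8080", toyParser)
    fmt.Println("found", len(jobs), "jobs")
}

Any function matching the parser signature can be plugged in, which is exactly how sitea and siteb hook in below.
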
58 changes: 58 additions & 0 deletions scraper/job/job_test.go
@@ -1,9 +1,15 @@
package job

import (
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/PuerkitoBio/goquery"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)

func TestDeduplicatedLinks(t *testing.T) {
@@ -19,3 +25,55 @@ func TestDeduplicatedLinks(t *testing.T) {
assert.Equal(t, "http://example.com/job1", deduplicated[0].Link, "Expected http://example.com/job1")
assert.Equal(t, "http://example.com/job2", deduplicated[1].Link, "Expected http://example.com/job2")
}

func TestGetJobHtml(t *testing.T) {
// Create a mock HTTP server
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
// Verify the scraper called the proxy route with the target URL appended as a query parameter
if strings.HasPrefix(req.URL.String(), "/proxy?url=") {
rw.Write([]byte("<html><body>Hello, World!</body></html>"))
} else {
http.Error(rw, "Invalid request URL", http.StatusBadRequest)
}
}))
defer server.Close()

// Test the GetJobHtml function
doc, err := GetJobHtml("https://example.com", server.URL)
require.NoError(t, err, "GetJobHtml returned an error")

// Test that the returned *goquery.Document has the correct HTML
require.Equal(t, "Hello, World!", doc.Find("body").Text(), "doc.Find(\"body\").Text() returned incorrect text")

// Test the GetJobHtml function with an invalid URL
_, err = GetJobHtml("invalid url", server.URL)
require.Error(t, err, "GetJobHtml should return an error for invalid URLs")
}

// MockJobParser is a mock job parser.
type MockJobParser struct {
mock.Mock
}

// Parse is a mock method that returns a slice of jobs.
func (m *MockJobParser) Parse(baseURL string, doc *goquery.Document) []Job {
args := m.Called(baseURL, doc)
return args.Get(0).([]Job)
}

func TestGetNewJobs(t *testing.T) {
// Create a mock HTTP server
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
rw.Write([]byte("<html><body>Hello, World!</body></html>"))
}))
defer server.Close()

// Create a mock job parser
mockJobParser := new(MockJobParser)
mockJobParser.On("Parse", mock.Anything, mock.Anything).Return([]Job{{Title: "Test Job"}})

// Test the GetNewJobs function
jobs := GetNewJobs("https://example.com", server.URL, mockJobParser.Parse)
require.Len(t, jobs, 1, "GetNewJobs should return 1 job")
require.Equal(t, "Test Job", jobs[0].Title, "jobs[0].Title should be 'Test Job'")
}
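
Because httptest.NewServer stands in for the proxy, these tests need no network access; a typical invocation from the repository root (the package path is taken from the diff headers above):

go test ./scraper/job/ -v
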
130 changes: 50 additions & 80 deletions scraper/sitea/sitea.go
@@ -1,9 +1,7 @@
package sitea

import (
"errors"
"log"
"net/http"
"scraper/interest"
"scraper/job"
"strconv"
@@ -12,95 +10,51 @@ import (
"github.com/PuerkitoBio/goquery"
)

func scanSiteA(siteABaseUrl string) []job.Job {
possibleJobs := []job.Job{}
finished := false
page := 1

for !finished || page > 15 {
pageStr := strconv.Itoa(page)
url := siteABaseUrl + "/jobs/remote/nationwide/dev-engineering?page=" + pageStr
// Make an HTTP GET request
response, err := http.Get(url)
if err != nil {
log.Fatal(err)
func siteAJobListParser(baseURL string, doc *goquery.Document) []job.Job {
newJobs := []job.Job{}
doc.Find("div[id^='job-card-']").Each(func(i int, s *goquery.Selection) {
recent := false
titleCheck := false
companyLink, _ := s.Find("a[href^='/company/']").Attr("href")
jobLink, _ := s.Find("a[id='job-card-alias']").Attr("href")
jobTitle := s.Find("a[id='job-card-alias']").Text()
timePosted := s.Find("span.font-barlow.text-gray-03").Text()

// Split the companyLink on '/' and get the last part
parts := strings.Split(companyLink, "/")
companyName := parts[len(parts)-1]

newJob := job.Job{
Link: baseURL + jobLink,
Company: companyName,
Title: jobTitle,
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
log.Fatalf("HTTP request failed with status: %s", response.Status)
}
timePosted = strings.ToLower(timePosted)
jobTitle = strings.ToLower(jobTitle)

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Fatal(err)
if strings.Contains(timePosted, "hours ago") || strings.Contains(timePosted, "minutes ago") || strings.Contains(timePosted, "hour ago") {
recent = true
}
currentJobCount := len(possibleJobs)

doc.Find("div[id^='job-card-']").Each(func(i int, s *goquery.Selection) {
recent := false
titleCheck := false
companyLink, _ := s.Find("a[href^='/company/']").Attr("href")
jobLink, _ := s.Find("a[id='job-card-alias']").Attr("href")
jobTitle := s.Find("a[id='job-card-alias']").Text()
timePosted := s.Find("span.font-barlow.text-gray-03").Text()

// Split the companyLink on '/' and get the last part
parts := strings.Split(companyLink, "/")
companyName := parts[len(parts)-1]

newJob := job.Job{
Link: siteABaseUrl + jobLink,
Company: companyName,
Title: jobTitle,
}

timePosted = strings.ToLower(timePosted)
jobTitle = strings.ToLower(jobTitle)

if strings.Contains(timePosted, "hours ago") || strings.Contains(timePosted, "minutes ago") || strings.Contains(timePosted, "hour ago") {
recent = true
}

titles := []string{"software engineer", "developer", "backend engineer", "backend developer", "backend", "software developer"}

for _, title := range titles {
if strings.Contains(jobTitle, title) {
titleCheck = true
break
}
}
titles := []string{"software engineer", "developer", "backend engineer", "backend developer", "backend", "software developer"}

if recent && titleCheck {
possibleJobs = append(possibleJobs, newJob)
for _, title := range titles {
if strings.Contains(jobTitle, title) {
titleCheck = true
break
}
})
// No new jobs found, we're done
if currentJobCount == len(possibleJobs) {
finished = true
}

page++
}

return job.DeduplicatedLinks(possibleJobs)
if recent && titleCheck {
newJobs = append(newJobs, newJob)
}
})
return newJobs
}

func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) {
response, err := http.Get(proxyUrl + "/proxy?url=" + jobLink)
if err != nil {
return "", err
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
err := errors.New("source HTTP request failed with status: " + response.Status)
return "", err
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
doc, err := job.GetJobHtml(jobLink, proxyUrl)
if err != nil {
return "", err
}
@@ -110,7 +64,23 @@ func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) {
}

func ScanNewJobs(siteABaseUrl string, proxyUrl string) []job.Job {
possibleJobs := scanSiteA(siteABaseUrl)
possibleJobs := []job.Job{}
finished := false
page := 1

for !finished && page <= 15 {
currentJobCount := len(possibleJobs)
pageStr := strconv.Itoa(page)
url := siteABaseUrl + "/jobs/remote/nationwide/dev-engineering?page=" + pageStr
jobs := job.GetNewJobs(url, proxyUrl, siteAJobListParser)
possibleJobs = append(possibleJobs, jobs...)
// No new jobs found, we're done
if currentJobCount == len(possibleJobs) {
finished = true
}
page++
}

log.Println("siteA total jobs found", len(possibleJobs))
interestingJobs := interest.FilterInterest(proxyUrl, possibleJobs, GetSiteAJobInfo)
log.Println("siteA interesting jobs found", len(interestingJobs))
41 changes: 5 additions & 36 deletions scraper/siteb/siteb.go
@@ -1,9 +1,7 @@
package siteb

import (
"errors"
"log"
"net/http"
"scraper/interest"
"strings"

@@ -12,26 +10,8 @@ import (
"github.com/PuerkitoBio/goquery"
)

func scanSiteB(siteBBaseUrl string) []job.Job {
url := siteBBaseUrl + "/jobs"
response, err := http.Get(url)
if err != nil {
log.Fatal(err)
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
log.Fatalf("HTTP request failed with status: %s", response.Status)
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Fatal(err)
}

var newJobs = []job.Job{}

func siteBJobListParser(siteBBaseUrl string, doc *goquery.Document) []job.Job {
newJobs := []job.Job{}
// Find the div with class "row search-result"
doc.Find("div.row.search-result").Each(func(i int, s *goquery.Selection) {
// Extract the href attribute from the <a> element with rel="canonical"
@@ -57,22 +37,11 @@ })
})

return newJobs

}

func getSiteBJobInfo(jobUrl string, proxyUrl string) (string, error) {
response, err := http.Get(proxyUrl + "/proxy?url=" + jobUrl)
if err != nil {
return "", err
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
err := errors.New("source HTTP request failed with status: " + response.Status)
return "", err
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
doc, err := job.GetJobHtml(jobUrl, proxyUrl)
if err != nil {
return "", err
}
@@ -92,7 +61,7 @@ func getSiteBJobInfo(jobUrl string, proxyUrl string) (string, error) {
}

func ScanNewJobs(sitebBaseUrl string, proxyUrl string) []job.Job {
jobs := scanSiteB(sitebBaseUrl)
jobs := job.GetNewJobs(sitebBaseUrl+"/jobs", proxyUrl, siteBJobListParser)
log.Println("siteB total jobs found", len(jobs))
interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteBJobInfo)
log.Println("siteB interesting jobs", len(interestingJobs))
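
With both sites delegating to job.GetNewJobs, a caller only supplies each site's base URL and the shared proxy address. A hypothetical wiring sketch (the package paths follow the scraper module seen in the imports; the URLs are placeholders):

package main

import (
    "log"

    "scraper/sitea"
    "scraper/siteb"
)

func main() {
    proxyURL := "http://localhost:8080" // shared proxy; address is an assumption

    // Every request below goes out through the proxy's /proxy?url= endpoint.
    siteAJobs := sitea.ScanNewJobs("https://site-a.example", proxyURL)
    siteBJobs := siteb.ScanNewJobs("https://site-b.example", proxyURL)

    log.Println("total interesting jobs:", len(siteAJobs)+len(siteBJobs))
}

With this wiring, nothing in the scrapers talks to the target sites directly, matching the commit message.
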