Merge pull request #7 from austin1237/proxy-refactor
All requests now go through proxy
austin1237 authored Feb 7, 2024
2 parents 864cd98 + dd3c524 commit bdbdc64
Showing 7 changed files with 174 additions and 196 deletions.
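
For context: after this refactor, every fetch in the scraper goes out as GET <proxyURL>/proxy?url=<target>, as the calls below show. The proxy service itself is not part of this diff; the following is a hypothetical sketch of a handler that would satisfy that contract (the port, package, and error handling are assumptions, not code from this repository):

package main

import (
    "io"
    "log"
    "net/http"
)

func main() {
    http.HandleFunc("/proxy", func(w http.ResponseWriter, r *http.Request) {
        // The target site arrives as ?url=..., matching the calls in this diff.
        target := r.URL.Query().Get("url")
        if target == "" {
            http.Error(w, "missing url parameter", http.StatusBadRequest)
            return
        }
        // Forward the request upstream and relay status and body back.
        resp, err := http.Get(target)
        if err != nil {
            http.Error(w, err.Error(), http.StatusBadGateway)
            return
        }
        defer resp.Body.Close()
        w.WriteHeader(resp.StatusCode)
        io.Copy(w, resp.Body)
    })
    log.Fatal(http.ListenAndServe(":8080", nil))
}
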
5 changes: 1 addition & 4 deletions go.work.sum
@@ -1,4 +1 @@
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
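
go.work.sum is maintained by the Go toolchain, so a pruning like the one above normally comes from re-syncing the workspace rather than from hand-editing; presumably a run of:

go work sync
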
50 changes: 50 additions & 0 deletions scraper/job/job.go
@@ -1,5 +1,14 @@
package job

import (
"errors"
"log"
"net/http"
"net/url"

"github.com/PuerkitoBio/goquery"
)

type Job struct {
Title string
Company string
@@ -18,3 +27,44 @@ func DeduplicatedLinks(jobs []Job) []Job {
}
return deduplicated
}

// GetJobHtml requests siteUrl via the proxy's /proxy?url= endpoint and
// parses the response body into a goquery document.
func GetJobHtml(siteUrl string, proxyURL string) (*goquery.Document, error) {
response, err := http.Get(proxyURL + "/proxy?url=" + siteUrl)
if err != nil {
log.Println(siteUrl+": Failed to get site", err)
return nil, err
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
err := errors.New(siteUrl + ": HTTP request failed with status: " + response.Status)
log.Println(err.Error())
return nil, err
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Println(siteUrl+": Failed to parse site", err)
return nil, err
}

return doc, nil
}

// parser converts a fetched listing page into a site's jobs.
type parser func(string, *goquery.Document) []Job

// GetNewJobs fetches siteUrl through the proxy, parses the page, and hands
// the document and the site's base URL to the site-specific jobParser.
func GetNewJobs(siteUrl string, proxyURL string, jobParser parser) []Job {
u, err := url.Parse(siteUrl)
if err != nil {
log.Println("Failed to parse url", err)
return []Job{}
}
baseURL := u.Scheme + "://" + u.Host
doc, err := GetJobHtml(siteUrl, proxyURL)
if err != nil {
return []Job{}
}
return jobParser(baseURL, doc)
}
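
Taken together, GetJobHtml and GetNewJobs are the shared entry points the site scrapers now call. A minimal caller sketch, assuming the scraper module layout visible in the imports above; toyParser, its selector, and the proxy address are illustrative only:

package main

import (
    "fmt"

    "scraper/job"

    "github.com/PuerkitoBio/goquery"
)

// toyParser is a made-up parser for illustration: it treats every
// <a class="job"> element as a job posting with a relative link.
func toyParser(baseURL string, doc *goquery.Document) []job.Job {
    jobs := []job.Job{}
    doc.Find("a.job").Each(func(i int, s *goquery.Selection) {
        href, _ := s.Attr("href")
        jobs = append(jobs, job.Job{Title: s.Text(), Link: baseURL + href})
    })
    return jobs
}

func main() {
    // The proxy address is an assumption; point it at the real proxy service.
    jobs := job.GetNewJobs("https://example.com/jobs", "http://localhost:8080", toyParser)
    fmt.Println("found", len(jobs), "jobs")
}

Any function matching the parser signature can be plugged in, which is exactly how sitea and siteb hook in below.
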
58 changes: 58 additions & 0 deletions scraper/job/job_test.go
@@ -1,9 +1,15 @@
package job

import (
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/PuerkitoBio/goquery"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/require"
)

func TestDeduplicatedLinks(t *testing.T) {
@@ -19,3 +25,55 @@ func TestDeduplicatedLinks(t *testing.T) {
assert.Equal(t, "http://example.com/job1", deduplicated[0].Link, "Expected http://example.com/job1")
assert.Equal(t, "http://example.com/job2", deduplicated[1].Link, "Expected http://example.com/job2")
}

func TestGetJobHtml(t *testing.T) {
// Create a mock HTTP server
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
// Verify the scraper called the proxy route with the target URL appended as a query parameter
if strings.HasPrefix(req.URL.String(), "/proxy?url=") {
rw.Write([]byte("<html><body>Hello, World!</body></html>"))
} else {
http.Error(rw, "Invalid request URL", http.StatusBadRequest)
}
}))
defer server.Close()

// Test the GetJobHtml function
doc, err := GetJobHtml("https://example.com", server.URL)
require.NoError(t, err, "GetJobHtml returned an error")

// Test that the returned *goquery.Document has the correct HTML
require.Equal(t, "Hello, World!", doc.Find("body").Text(), "doc.Find(\"body\").Text() returned incorrect text")

// Test the GetJobHtml function with an invalid URL
_, err = GetJobHtml("invalid url", server.URL)
require.Error(t, err, "GetJobHtml should return an error for invalid URLs")
}

// MockJobParser is a mock job parser.
type MockJobParser struct {
mock.Mock
}

// Parse is a mock method that returns a slice of jobs.
func (m *MockJobParser) Parse(baseURL string, doc *goquery.Document) []Job {
args := m.Called(baseURL, doc)
return args.Get(0).([]Job)
}

func TestGetNewJobs(t *testing.T) {
// Create a mock HTTP server
server := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, req *http.Request) {
rw.Write([]byte("<html><body>Hello, World!</body></html>"))
}))
defer server.Close()

// Create a mock job parser
mockJobParser := new(MockJobParser)
mockJobParser.On("Parse", mock.Anything, mock.Anything).Return([]Job{{Title: "Test Job"}})

// Test the GetNewJobs function
jobs := GetNewJobs("https://example.com", server.URL, mockJobParser.Parse)
require.Len(t, jobs, 1, "GetNewJobs should return 1 job")
require.Equal(t, "Test Job", jobs[0].Title, "jobs[0].Title should be 'Test Job'")
}
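
Because httptest.NewServer stands in for the proxy, these tests need no network access; a typical invocation from the repository root (the package path is taken from the diff headers above):

go test ./scraper/job/ -v
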
130 changes: 50 additions & 80 deletions scraper/sitea/sitea.go
@@ -1,9 +1,7 @@
package sitea

import (
"errors"
"log"
"net/http"
"scraper/interest"
"scraper/job"
"strconv"
@@ -12,95 +10,51 @@ import (
"github.com/PuerkitoBio/goquery"
)

func scanSiteA(siteABaseUrl string) []job.Job {
possibleJobs := []job.Job{}
finished := false
page := 1

for !finished || page > 15 {
pageStr := strconv.Itoa(page)
url := siteABaseUrl + "/jobs/remote/nationwide/dev-engineering?page=" + pageStr
// Make an HTTP GET request
response, err := http.Get(url)
if err != nil {
log.Fatal(err)
func siteAJobListParser(baseURL string, doc *goquery.Document) []job.Job {
newJobs := []job.Job{}
doc.Find("div[id^='job-card-']").Each(func(i int, s *goquery.Selection) {
recent := false
titleCheck := false
companyLink, _ := s.Find("a[href^='/company/']").Attr("href")
jobLink, _ := s.Find("a[id='job-card-alias']").Attr("href")
jobTitle := s.Find("a[id='job-card-alias']").Text()
timePosted := s.Find("span.font-barlow.text-gray-03").Text()

// Split the companyLink on '/' and get the last part
parts := strings.Split(companyLink, "/")
companyName := parts[len(parts)-1]

newJob := job.Job{
Link: baseURL + jobLink,
Company: companyName,
Title: jobTitle,
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
log.Fatalf("HTTP request failed with status: %s", response.Status)
}
timePosted = strings.ToLower(timePosted)
jobTitle = strings.ToLower(jobTitle)

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Fatal(err)
if strings.Contains(timePosted, "hours ago") || strings.Contains(timePosted, "minutes ago") || strings.Contains(timePosted, "hour ago") {
recent = true
}
currentJobCount := len(possibleJobs)

doc.Find("div[id^='job-card-']").Each(func(i int, s *goquery.Selection) {
recent := false
titleCheck := false
companyLink, _ := s.Find("a[href^='/company/']").Attr("href")
jobLink, _ := s.Find("a[id='job-card-alias']").Attr("href")
jobTitle := s.Find("a[id='job-card-alias']").Text()
timePosted := s.Find("span.font-barlow.text-gray-03").Text()

// Split the companyLink on '/' and get the last part
parts := strings.Split(companyLink, "/")
companyName := parts[len(parts)-1]

newJob := job.Job{
Link: siteABaseUrl + jobLink,
Company: companyName,
Title: jobTitle,
}

timePosted = strings.ToLower(timePosted)
jobTitle = strings.ToLower(jobTitle)

if strings.Contains(timePosted, "hours ago") || strings.Contains(timePosted, "minutes ago") || strings.Contains(timePosted, "hour ago") {
recent = true
}

titles := []string{"software engineer", "developer", "backend engineer", "backend developer", "backend", "software developer"}

for _, title := range titles {
if strings.Contains(jobTitle, title) {
titleCheck = true
break
}
}
titles := []string{"software engineer", "developer", "backend engineer", "backend developer", "backend", "software developer"}

if recent && titleCheck {
possibleJobs = append(possibleJobs, newJob)
for _, title := range titles {
if strings.Contains(jobTitle, title) {
titleCheck = true
break
}
})
// No new jobs found, we're done
if currentJobCount == len(possibleJobs) {
finished = true
}

page++
}

return job.DeduplicatedLinks(possibleJobs)
if recent && titleCheck {
newJobs = append(newJobs, newJob)
}
})
return newJobs
}

func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) {
response, err := http.Get(proxyUrl + "/proxy?url=" + jobLink)
if err != nil {
return "", err
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
err := errors.New("source HTTP request failed with status: " + response.Status)
return "", err
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
doc, err := job.GetJobHtml(jobLink, proxyUrl)
if err != nil {
return "", err
}
@@ -110,7 +64,23 @@ func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) {
}

func ScanNewJobs(siteABaseUrl string, proxyUrl string) []job.Job {
possibleJobs := scanSiteA(siteABaseUrl)
possibleJobs := []job.Job{}
finished := false
page := 1

for !finished && page <= 15 {
currentJobCount := len(possibleJobs)
pageStr := strconv.Itoa(page)
url := siteABaseUrl + "/jobs/remote/nationwide/dev-engineering?page=" + pageStr
jobs := job.GetNewJobs(url, proxyUrl, siteAJobListParser)
possibleJobs = append(possibleJobs, jobs...)
// No new jobs found, we're done
if currentJobCount == len(possibleJobs) {
finished = true
}
page++
}

log.Println("siteA total jobs found", len(possibleJobs))
interestingJobs := interest.FilterInterest(proxyUrl, possibleJobs, GetSiteAJobInfo)
log.Println("siteA interesting jobs found", len(interestingJobs))
41 changes: 5 additions & 36 deletions scraper/siteb/siteb.go
@@ -1,9 +1,7 @@
package siteb

import (
"errors"
"log"
"net/http"
"scraper/interest"
"strings"

@@ -12,26 +10,8 @@ import (
"github.com/PuerkitoBio/goquery"
)

func scanSiteB(siteBBaseUrl string) []job.Job {
url := siteBBaseUrl + "/jobs"
response, err := http.Get(url)
if err != nil {
log.Fatal(err)
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
log.Fatalf("HTTP request failed with status: %s", response.Status)
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
if err != nil {
log.Fatal(err)
}

var newJobs = []job.Job{}

func siteBJobListParser(siteBBaseUrl string, doc *goquery.Document) []job.Job {
newJobs := []job.Job{}
// Find the div with class "row search-result"
doc.Find("div.row.search-result").Each(func(i int, s *goquery.Selection) {
// Extract the href attribute from the <a> element with rel="canonical"
@@ -57,22 +37,11 @@ })
})

return newJobs

}

func getSiteBJobInfo(jobUrl string, proxyUrl string) (string, error) {
response, err := http.Get(proxyUrl + "/proxy?url=" + jobUrl)
if err != nil {
return "", err
}
defer response.Body.Close()

if response.StatusCode != http.StatusOK {
err := errors.New("source HTTP request failed with status: " + response.Status)
return "", err
}

// Parse the HTML document using goquery
doc, err := goquery.NewDocumentFromReader(response.Body)
doc, err := job.GetJobHtml(jobUrl, proxyUrl)
if err != nil {
return "", err
}
@@ -92,7 +61,7 @@ func getSiteBJobInfo(jobUrl string, proxyUrl string) (string, error) {
}

func ScanNewJobs(sitebBaseUrl string, proxyUrl string) []job.Job {
jobs := scanSiteB(sitebBaseUrl)
jobs := job.GetNewJobs(sitebBaseUrl+"/jobs", proxyUrl, siteBJobListParser)
log.Println("siteB total jobs found", len(jobs))
interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteBJobInfo)
log.Println("siteB interesting jobs", len(interestingJobs))
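
With both sites delegating to job.GetNewJobs, a caller only supplies each site's base URL and the shared proxy address. A hypothetical wiring sketch (the package paths follow the scraper module seen in the imports; the URLs are placeholders):

package main

import (
    "log"

    "scraper/sitea"
    "scraper/siteb"
)

func main() {
    proxyURL := "http://localhost:8080" // shared proxy; address is an assumption

    // Every request below goes out through the proxy's /proxy?url= endpoint.
    siteAJobs := sitea.ScanNewJobs("https://site-a.example", proxyURL)
    siteBJobs := siteb.ScanNewJobs("https://site-b.example", proxyURL)

    log.Println("total interesting jobs:", len(siteAJobs)+len(siteBJobs))
}

With this wiring, nothing in the scrapers talks to the target sites directly, matching the commit message.
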