From 671e30a60754df96dd27de2e421a717b16bf25f3 Mon Sep 17 00:00:00 2001
From: Austin Davis
Date: Fri, 26 Apr 2024 21:58:47 -0600
Subject: [PATCH] scraper now goes through job api instead of accessing
 db/discord directly.

---
 scraper/cache/cache.go          | 65 ----------------------
 scraper/cache/cache_test.go     | 63 ---------------------
 scraper/discord/discord.go      | 72 ------------------------
 scraper/discord/discord_test.go | 60 --------------------
 scraper/dynamo/dynamo.go        | 98 ---------------------------------
 scraper/dynamo/dynamo_test.go   | 52 -----------------
 scraper/go.mod                  |  2 -
 scraper/go.sum                  |  8 ---
 scraper/job/job.go              | 29 ++++++++++
 scraper/main.go                 | 40 +++++---------
 scraper/remotive/remotive.go    | 22 +-------
 scraper/sitea/sitea.go          | 15 +----
 scraper/siteb/siteb.go          | 15 +----
 scraper/sitec/sitec.go          | 16 +-----
 scraper/sited/sited.go          | 15 +----
 scraper/sitee/sitee.go          | 15 +----
 scraper/sitef/sitef.go          | 15 +----
 terraform/main.tf               |  3 +-
 18 files changed, 64 insertions(+), 541 deletions(-)
 delete mode 100644 scraper/cache/cache.go
 delete mode 100644 scraper/cache/cache_test.go
 delete mode 100644 scraper/discord/discord.go
 delete mode 100644 scraper/discord/discord_test.go
 delete mode 100644 scraper/dynamo/dynamo.go
 delete mode 100644 scraper/dynamo/dynamo_test.go

diff --git a/scraper/cache/cache.go b/scraper/cache/cache.go
deleted file mode 100644
index 8f9b2cb..0000000
--- a/scraper/cache/cache.go
+++ /dev/null
@@ -1,65 +0,0 @@
-package cache
-
-import (
-    "scraper/job"
-)
-
-type Table interface {
-    ReadItem(company string) (string, error)
-    WriteItems(companies []string)
-}
-
-type Cache struct {
-    table Table
-}
-
-func NewCache(table Table) *Cache {
-    return &Cache{table: table}
-}
-
-func (c *Cache) FilterCachedCompanies(jobs []job.Job) ([]job.Job, error) {
-    notInCache := make([]job.Job, 0)
-    errChan := make(chan error, len(jobs))
-    notFoundChan := make(chan job.Job, len(jobs))
-    foundChan := make(chan job.Job, len(jobs))
-
-    for _, newJob := range jobs {
-        go func(newJob job.Job) {
-            result, err := c.table.ReadItem(newJob.Company)
-            if result == "" {
-                // company is not in the cache
-                notFoundChan <- newJob
-            } else {
-                foundChan <- newJob
-            }
-
-            if err != nil {
-                errChan <- err
-            }
-
-        }(newJob)
-    }
-
-    // Collect results from the goroutines
-    for range jobs {
-        select {
-        case job := <-notFoundChan:
-            notInCache = append(notInCache, job)
-        case <-foundChan:
-            // do nothing
-        case err := <-errChan:
-            return nil, err
-        }
-
-    }
-
-    return notInCache, nil
-}
-
-func (c *Cache) WriteCompaniesToCache(jobs []job.Job) {
-    companies := make([]string, 0, len(jobs))
-    for _, job := range jobs {
-        companies = append(companies, job.Company)
-    }
-    c.table.WriteItems(companies)
-}
diff --git a/scraper/cache/cache_test.go b/scraper/cache/cache_test.go
deleted file mode 100644
index ebd5da8..0000000
--- a/scraper/cache/cache_test.go
+++ /dev/null
@@ -1,63 +0,0 @@
-package cache
-
-import (
-    "scraper/job"
-    "testing"
-
-    "github.com/stretchr/testify/assert"
-    "github.com/stretchr/testify/mock"
-)
-
-type MockTable struct {
-    mock.Mock
-}
-
-func (m *MockTable) ReadItem(company string) (string, error) {
-    args := m.Called(company)
-    return args.String(0), args.Error(1)
-}
-
-func (m *MockTable) WriteItems(companies []string) {
-    m.Called(companies)
-}
-
-func TestFilterCachedCompanies(t *testing.T) {
-    mockTable := new(MockTable)
-    mockTable.On("ReadItem", "Acme Corp").Return("Acme Corp", nil)
-    mockTable.On("ReadItem", "Globex Corporation").Return("", nil)
-
-    cache := &Cache{
-        table: mockTable,
-    }
-
-    // Test the FilterCachedCompanies method
-    jobs := []job.Job{
-        {Company: "Acme Corp"},
-        {Company: "Globex Corporation"},
-    }
-    notInCache, err := cache.FilterCachedCompanies(jobs)
-
-    assert.NoError(t, err)
-    assert.Len(t, notInCache, 1)
-    assert.Equal(t, "Globex Corporation", notInCache[0].Company)
-
-    mockTable.AssertExpectations(t)
-}
-
-func TestWriteCompaniesToCache(t *testing.T) {
-    mockTable := new(MockTable)
-    mockTable.On("WriteItems", []string{"Acme Corp", "Globex Corporation"}).Return()
-
-    cache := &Cache{
-        table: mockTable,
-    }
-
-    // Test the WriteCompaniesToCache method
-    jobs := []job.Job{
-        {Company: "Acme Corp"},
-        {Company: "Globex Corporation"},
-    }
-    cache.WriteCompaniesToCache(jobs)
-
-    mockTable.AssertExpectations(t)
-}
diff --git a/scraper/discord/discord.go b/scraper/discord/discord.go
deleted file mode 100644
index f912a8b..0000000
--- a/scraper/discord/discord.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package discord
-
-import (
-    "bytes"
-    "encoding/json"
-    "net/http"
-    "scraper/job"
-)
-
-func generateMessages(jobs []job.Job) []string {
-    var messages []string
-    var message bytes.Buffer
-    message.WriteString("```")
-
-    for _, job := range jobs {
-        newLine := job.Link + ", " + job.Company + "\n"
-        // Discord has a 2000 character limit for messages
-        if message.Len()+len(newLine)+3 >= 2000 { // +3 for the ending "```"
-            message.WriteString("```")
-            messages = append(messages, message.String())
-            message.Reset()
-            message.WriteString("```")
-        }
-        message.WriteString(newLine)
-    }
-
-    if message.Len() > 0 {
-        message.WriteString("```")
-        messages = append(messages, message.String())
-    }
-
-    return messages
-}
-
-func SendJobsToDiscord(jobs []job.Job, webhookURL string) []error {
-    if len(jobs) == 0 {
-        return nil
-    }
-    messages := generateMessages(jobs)
-    errorChannel := make(chan error, len(messages))
-
-    for _, message := range messages {
-        go func(message string) {
-            payload := map[string]string{
-                "content": message,
-            }
-
-            jsonPayload, err := json.Marshal(payload)
-            if err != nil {
-                errorChannel <- err
-                return
-            }
-
-            resp, err := http.Post(webhookURL, "application/json", bytes.NewBuffer(jsonPayload))
-            if err != nil {
-                errorChannel <- err
-                return
-            }
-            defer resp.Body.Close()
-            errorChannel <- nil
-        }(message)
-    }
-
-    var errors []error
-    for i := 0; i < len(messages); i++ {
-        if err := <-errorChannel; err != nil {
-            errors = append(errors, err)
-        }
-    }
-
-    return errors
-}
diff --git a/scraper/discord/discord_test.go b/scraper/discord/discord_test.go
deleted file mode 100644
index 9769d5a..0000000
--- a/scraper/discord/discord_test.go
+++ /dev/null
@@ -1,60 +0,0 @@
-package discord
-
-import (
-    "scraper/job"
-    "strings"
-    "testing"
-
-    "github.com/stretchr/testify/assert"
-)
-
-func TestGenerateMessages(t *testing.T) {
-    jobs := []job.Job{
-        {Link: "http://example.com/job1", Company: "Company1"},
-        {Link: "http://example.com/job2", Company: "Company2"},
-        {Link: "http://example.com/job3", Company: "Company3"},
-        // Add more jobs to test the 2000 character limit
-    }
-
-    messages := generateMessages(jobs)
-
-    // Check that each message is less than or equal to 2000 characters
-    for _, message := range messages {
-        assert.True(t, len(message) <= 2000, "Message length should be less than or equal to 2000 characters")
-    }
-
-    // Check that all jobs are included in the messages
-    for _, job := range jobs {
-        jobLine := job.Link + ", " + job.Company
-        found := false
-        for _, message := range messages {
-            if strings.Contains(message, jobLine) {
-                found = true
-                break
-            }
-        }
-        assert.True(t, found, "All jobs should be included in the messages")
-    }
-}
-
-func TestGenerateMessages_MultipleMessages(t *testing.T) {
-    // Create a job with a link and company name that together are 200 characters long
-    newJob := job.Job{
-        Link:    strings.Repeat("a", 100), // = 100
-        Company: strings.Repeat("b", 97), // ", " and the ending "\n" is 3 characters, so 97 + 3 = 100
-    }
-
-    // Create 11 jobs, which should result in a total length of 2200 characters of job text
-    jobs := make([]job.Job, 11)
-    for i := range jobs {
-        jobs[i] = newJob
-    }
-
-    messages := generateMessages(jobs)
-
-    // Check that multiple messages were created
-    assert.True(t, len(messages) == 2, "Multiple messages should be created when the total length of the jobs exceeds 2000 characters")
-    // The additional 6 characters are the "```" at the start and end of the message
-    assert.True(t, len(messages[0]) == 1806, "The first message should be 1806 characters long")
-    assert.True(t, len(messages[1]) == 406, "The second message should be 406 characters long")
-}
diff --git a/scraper/dynamo/dynamo.go b/scraper/dynamo/dynamo.go
deleted file mode 100644
index 39b00d1..0000000
--- a/scraper/dynamo/dynamo.go
+++ /dev/null
@@ -1,98 +0,0 @@
-package dynamo
-
-import (
-    "log"
-    "strconv"
-    "strings"
-    "sync"
-    "time"
-
-    "github.com/aws/aws-sdk-go/aws"
-    "github.com/aws/aws-sdk-go/aws/session"
-    "github.com/aws/aws-sdk-go/service/dynamodb"
-)
-
-type DynamoDBAPI interface {
-    UpdateItem(input *dynamodb.UpdateItemInput) (*dynamodb.UpdateItemOutput, error)
-    GetItem(input *dynamodb.GetItemInput) (*dynamodb.GetItemOutput, error)
-}
-
-type Table struct {
-    Name string
-    svc  DynamoDBAPI
-}
-
-func NewTable(name string, region string) (*Table, error) {
-    sess, err := session.NewSession(&aws.Config{
-        Region: aws.String(region), // replace with your region
-    })
-    if err != nil {
-        return nil, err
-    }
-
-    svc := dynamodb.New(sess)
-
-    return &Table{Name: name, svc: svc}, nil
-}
-
-func (t *Table) ReadItem(company string) (string, error) {
-    input := &dynamodb.GetItemInput{
-        TableName: aws.String(t.Name),
-        Key: map[string]*dynamodb.AttributeValue{
-            "company": {
-                S: aws.String(strings.ToLower(company)),
-            },
-        },
-    }
-
-    result, err := t.svc.GetItem(input)
-    if err != nil {
-        return "", err
-    }
-
-    if result.Item == nil {
-        return "", nil
-    }
-
-    return *result.Item["company"].S, nil
-}
-
-func (t *Table) WriteItems(companies []string) {
-    // Set the TTL to one month from now
-    expirationTime := time.Now().AddDate(0, 1, 0).Unix()
-
-    // Create a wait group
-    var wg sync.WaitGroup
-
-    // Write each company to the table in a separate goroutine
-    for _, company := range companies {
-        wg.Add(1)
-        go func(company string) {
-            defer wg.Done()
-
-            input := &dynamodb.UpdateItemInput{
-                ExpressionAttributeValues: map[string]*dynamodb.AttributeValue{
-                    ":expirationTime": {
-                        N: aws.String(strconv.FormatInt(expirationTime, 10)),
-                    },
-                },
-                TableName: aws.String(t.Name),
-                Key: map[string]*dynamodb.AttributeValue{
-                    "company": {
-                        S: aws.String(strings.ToLower(company)),
-                    },
-                },
-                ReturnValues:     aws.String("UPDATED_NEW"),
-                UpdateExpression: aws.String("set ExpirationTime = :expirationTime"),
-            }
-
-            _, err := t.svc.UpdateItem(input)
-            if err != nil {
-                log.Println("Error writing company to cache", err)
-            }
-        }(company)
-    }
-
-    // Wait for all goroutines to finish
-    wg.Wait()
-}
diff --git a/scraper/dynamo/dynamo_test.go b/scraper/dynamo/dynamo_test.go
deleted file mode 100644
index ed8be07..0000000
--- a/scraper/dynamo/dynamo_test.go
+++ /dev/null
@@ -1,52 +0,0 @@
-package dynamo
-
-import (
-    "testing"
-
-    "github.com/aws/aws-sdk-go/service/dynamodb"
-    "github.com/stretchr/testify/assert"
-    "github.com/stretchr/testify/mock"
-)
-
-type MockDynamoDB struct {
-    mock.Mock
-}
-
-func (m *MockDynamoDB) UpdateItem(input *dynamodb.UpdateItemInput) (*dynamodb.UpdateItemOutput, error) {
-    args := m.Called(input)
-    return args.Get(0).(*dynamodb.UpdateItemOutput), args.Error(1)
-}
-
-func (m *MockDynamoDB) GetItem(input *dynamodb.GetItemInput) (*dynamodb.GetItemOutput, error) {
-    args := m.Called(input)
-    return args.Get(0).(*dynamodb.GetItemOutput), args.Error(1)
-}
-
-func TestNewTable(t *testing.T) {
-    table, err := NewTable("test", "us-west-2")
-    assert.NoError(t, err)
-    assert.NotNil(t, table)
-}
-
-func TestReadItem(t *testing.T) {
-    mockSvc := new(MockDynamoDB)
-    table := &Table{Name: "test", svc: mockSvc}
-
-    mockSvc.On("GetItem", mock.Anything).Return(&dynamodb.GetItemOutput{}, nil)
-
-    _, err := table.ReadItem("Acme Corp")
-    assert.NoError(t, err)
-
-    mockSvc.AssertExpectations(t)
-}
-
-func TestWriteItems(t *testing.T) {
-    mockSvc := new(MockDynamoDB)
-    table := &Table{Name: "test", svc: mockSvc}
-
-    mockSvc.On("UpdateItem", mock.Anything).Return(&dynamodb.UpdateItemOutput{}, nil)
-
-    table.WriteItems([]string{"Acme Corp", "Globex Corporation"})
-
-    mockSvc.AssertExpectations(t)
-}
diff --git a/scraper/go.mod b/scraper/go.mod
index f38ec56..25ab6bb 100644
--- a/scraper/go.mod
+++ b/scraper/go.mod
@@ -5,14 +5,12 @@ go 1.20
 require (
 	github.com/PuerkitoBio/goquery v1.8.1
 	github.com/aws/aws-lambda-go v1.45.0
-	github.com/aws/aws-sdk-go v1.50.29
 	github.com/stretchr/testify v1.7.2
 )
 
 require (
 	github.com/andybalholm/cascadia v1.3.1 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
-	github.com/jmespath/go-jmespath v0.4.0 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/stretchr/objx v0.1.0 // indirect
 	golang.org/x/net v0.17.0 // indirect
diff --git a/scraper/go.sum b/scraper/go.sum
index 2467a5c..3270977 100644
--- a/scraper/go.sum
+++ b/scraper/go.sum
@@ -4,15 +4,9 @@ github.com/andybalholm/cascadia v1.3.1 h1:nhxRkql1kdYCc8Snf7D5/D3spOX+dBgjA6u8x0
 github.com/andybalholm/cascadia v1.3.1/go.mod h1:R4bJ1UQfqADjvDa4P6HZHLh/3OxWWEqc0Sk8XGwHqvA=
 github.com/aws/aws-lambda-go v1.45.0 h1:3xS35Dlc8ffmcwfcKTyqJGiMuL0UDvkQaVUrI5yHycI=
 github.com/aws/aws-lambda-go v1.45.0/go.mod h1:dpMpZgvWx5vuQJfBt0zqBha60q7Dd7RfgJv23DymV8A=
-github.com/aws/aws-sdk-go v1.50.29 h1:Ol2FYzesF2tsQrgVSnDWRFI60+FsSqKKdt7MLlZKubc=
-github.com/aws/aws-sdk-go v1.50.29/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
-github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
-github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8=
-github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
@@ -53,7 +47,5 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc
 golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
-gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/scraper/job/job.go b/scraper/job/job.go
index 67e6bfd..f51a91d 100644
--- a/scraper/job/job.go
+++ b/scraper/job/job.go
@@ -1,6 +1,8 @@
 package job
 
 import (
+    "bytes"
+    "encoding/json"
     "errors"
     "log"
     "net/http"
@@ -15,6 +17,12 @@ type Job struct {
     Link string
 }
 
+type Response struct {
+    Total      int `json:"total"`
+    Uncached   int `json:"uncached"`
+    Duplicates int `json:"duplicates"`
+}
+
 func DeduplicatedLinks(jobs []Job) []Job {
     seen := make(map[string]bool)
     deduplicated := []Job{}
@@ -80,3 +88,24 @@ func GetNewJobs(siteUrl string, proxyURL string, jobParser parser, optionalMode
     }
     return jobParser(baseURL, doc)
 }
+
+func SendJobs(jobURL string, jobs []Job) (Response, error) {
+    var response Response
+    jsonData, err := json.Marshal(map[string][]Job{"jobs": jobs})
+    if err != nil {
+        return response, err
+    }
+
+    resp, err := http.Post(jobURL+"/job", "application/json", bytes.NewBuffer(jsonData))
+    if err != nil {
+        return response, err
+    }
+    defer resp.Body.Close()
+
+    err = json.NewDecoder(resp.Body).Decode(&response)
+    if err != nil {
+        return response, err
+    }
+
+    return response, nil
+}
diff --git a/scraper/main.go b/scraper/main.go
index 08100e9..3cc028c 100644
--- a/scraper/main.go
+++ b/scraper/main.go
@@ -4,9 +4,6 @@ import (
     "context"
     "log"
     "os"
-    "scraper/cache"
-    "scraper/discord"
-    "scraper/dynamo"
     "scraper/job"
     "scraper/remotive"
     "scraper/sitea"
@@ -21,7 +18,7 @@ import (
 )
 
 type Site struct {
-    ScanNewJobs func(string, string, *cache.Cache) ([]job.Job, []job.Job)
+    ScanNewJobs func(string, string) []job.Job
     BaseURL     string
 }
 
@@ -32,14 +29,13 @@ type Result struct {
 
 var (
     proxyURL            string
-    scraperWebhook      string
     scraperSiteABaseURL string
     scraperSiteBBaseURL string
     scraperSiteCBaseURL string
     scraperSiteDBaseURL string
     scraperSiteEBaseURL string
     scraperSiteFBaseURL string
-    dynamoTable         string
+    jobURL              string
 )
 
 func init() {
@@ -48,11 +44,6 @@ func init() {
     proxyURL = os.Getenv("PROXY_URL")
     if proxyURL == "" {
         log.Fatal("Environment variable PROXY_URL must be set")
     }
 
-    scraperWebhook = os.Getenv("SCRAPER_WEBHOOK")
-    if scraperWebhook == "" {
-        log.Fatal("Environment variable SCRAPER_WEBHOOK must be set")
-    }
-
     scraperSiteABaseURL = os.Getenv("SCRAPER_SITEA_BASEURL")
     if scraperSiteABaseURL == "" {
         log.Fatal("Environment variable SCRAPER_SITEA_BASEURL must be set")
@@ -83,20 +74,14 @@ func init() {
         log.Fatal("Environment variable SCRAPER_SITEF_BASEURL must be set")
     }
 
-    dynamoTable = os.Getenv("DYNAMO_TABLE")
-    if dynamoTable == "" {
-        log.Fatal("Environment variable DYNAMO_TABLE must be set")
+    jobURL = os.Getenv("JOB_URL")
+    if jobURL == "" {
+        log.Fatal("Environment variable JOB_URL must be set")
     }
 }
 
 func lookForNewJobs() {
-    table, err := dynamo.NewTable(dynamoTable, "us-east-1") // replace with your region
-    if err != nil {
-        log.Fatal(err)
-    }
-
-    cache := cache.NewCache(table)
     var sites = []Site{
         {ScanNewJobs: sitea.ScanNewJobs, BaseURL: scraperSiteABaseURL},
         {ScanNewJobs: siteb.ScanNewJobs, BaseURL: scraperSiteBBaseURL},
@@ -113,14 +98,15 @@ func lookForNewJobs() {
     for _, site := range sites {
         go func(site Site) {
             start := time.Now()
-            uncachedJobs, interestingJobs := site.ScanNewJobs(site.BaseURL, proxyURL, cache)
-            errs := discord.SendJobsToDiscord(interestingJobs, scraperWebhook)
-            if len(errs) == 0 {
-                cache.WriteCompaniesToCache(uncachedJobs)
-            } else {
-                log.Println("Error sending to discord", errs)
-            }
+            interestingJobs := site.ScanNewJobs(site.BaseURL, proxyURL)
+            results, err := job.SendJobs(jobURL, interestingJobs)
             elapsed := time.Since(start)
+            if err != nil {
+                log.Println("Error sending to job api", err)
+                doneChannel <- Result{Elapsed: elapsed, URL: site.BaseURL}
+                return
+            }
+            log.Println(site.BaseURL+", Total Jobs: ", results.Total, "Uncached Jobs: ", results.Uncached, "Duplicates: ", results.Duplicates)
             doneChannel <- Result{Elapsed: elapsed, URL: site.BaseURL}
         }(site)
     }
diff --git a/scraper/remotive/remotive.go b/scraper/remotive/remotive.go
index 94dc61b..8cbf438 100644
--- a/scraper/remotive/remotive.go
+++ b/scraper/remotive/remotive.go
@@ -4,7 +4,6 @@ import (
     "encoding/json"
     "log"
     "net/http"
-    "scraper/cache"
     "scraper/interest"
     "scraper/job"
     "time"
@@ -72,26 +71,10 @@ func callApi(site string) []remotiveJob {
     return newJobs
 }
 
-func ScanNewJobs(baseURL string, proxyUrl string, cache *cache.Cache) ([]job.Job, []job.Job) {
+func ScanNewJobs(baseURL string, proxyUrl string) []job.Job {
     remotiveJobs := callApi(baseURL + "/api/remote-jobs?category=software-dev&limit=100")
     log.Println("Remotive total jobs found", len(remotiveJobs))
     var interestingJobs []job.Job
-    var newJobs []job.Job
-
-    for _, newJob := range remotiveJobs {
-        newJobs = append(newJobs, job.Job{
-            Title:   newJob.Title,
-            Link:    newJob.URL,
-            Company: newJob.CompanyName,
-        })
-    }
-
-    unCachedJobs, err := cache.FilterCachedCompanies(newJobs)
-    if err != nil {
-        log.Println("Error filtering cached companies", err)
-    }
-    log.Println(baseURL+" total jobs not found in cache", len(unCachedJobs))
-
     for _, newJob := range remotiveJobs {
         if interest.CheckIfInterested(newJob.Description) {
             interestingJobs = append(interestingJobs, job.Job{
@@ -101,6 +84,7 @@ func ScanNewJobs(baseURL string, proxyUrl string, cache *cache.Cache) ([]job.Job
         }
     }
+    log.Println("Remotive interesting jobs", len(interestingJobs))
-    return unCachedJobs, interestingJobs
+    return interestingJobs
 }
diff --git a/scraper/sitea/sitea.go b/scraper/sitea/sitea.go
index 13b4a4c..45d2487 100644
--- a/scraper/sitea/sitea.go
+++ b/scraper/sitea/sitea.go
@@ -1,8 +1,6 @@
 package sitea
 
 import (
-    "log"
-    "scraper/cache"
     "scraper/interest"
     "scraper/job"
     "strconv"
@@ -65,7 +63,7 @@ func GetSiteAJobInfo(jobLink string, proxyUrl string) (string, error) {
     return jobDescription, nil
 }
 
-func ScanNewJobs(siteABaseUrl string, proxyUrl string, cache *cache.Cache) ([]job.Job, []job.Job) {
+func ScanNewJobs(siteABaseUrl string, proxyUrl string) []job.Job {
     var wg sync.WaitGroup
     jobsChan := make(chan []job.Job)
 
@@ -101,13 +99,6 @@ func ScanNewJobs(siteABaseUrl string, proxyUrl string, cache *cache.Cache) ([]jo
         possibleJobs = append(possibleJobs, jobs...)
     }
 
-    log.Println(siteABaseUrl+" total jobs found", len(possibleJobs))
-    unCachedJobs, err := cache.FilterCachedCompanies(possibleJobs)
-    if err != nil {
-        log.Println("Error filtering cached companies", err)
-    }
-    log.Println(siteABaseUrl+" total jobs not found in cache", len(unCachedJobs))
-    interestingJobs := interest.FilterInterest(proxyUrl, unCachedJobs, GetSiteAJobInfo)
-    log.Println(siteABaseUrl+" interesting jobs found", len(interestingJobs))
-    return unCachedJobs, interestingJobs
+    interestingJobs := interest.FilterInterest(proxyUrl, possibleJobs, GetSiteAJobInfo)
+    return interestingJobs
 }
diff --git a/scraper/siteb/siteb.go b/scraper/siteb/siteb.go
index 2569b02..50ade7c 100644
--- a/scraper/siteb/siteb.go
+++ b/scraper/siteb/siteb.go
@@ -1,8 +1,6 @@
 package siteb
 
 import (
-    "log"
-    "scraper/cache"
     "scraper/interest"
     "strings"
 
@@ -61,15 +59,8 @@ func getSiteBJobInfo(jobUrl string, proxyUrl string) (string, error) {
     return description, nil
 }
 
-func ScanNewJobs(sitebBaseUrl string, proxyUrl string, cache *cache.Cache) ([]job.Job, []job.Job) {
+func ScanNewJobs(sitebBaseUrl string, proxyUrl string) []job.Job {
     jobs := job.GetNewJobs(sitebBaseUrl+"/jobs", proxyUrl, siteBJobListParser)
-    log.Println(sitebBaseUrl+" total jobs found", len(jobs))
-    unCachedJobs, err := cache.FilterCachedCompanies(jobs)
-    if err != nil {
-        log.Println("Error filtering cached companies", err)
-    }
-    log.Println(sitebBaseUrl+" total jobs not found in cache", len(unCachedJobs))
-    interestingJobs := interest.FilterInterest(proxyUrl, unCachedJobs, getSiteBJobInfo)
-    log.Println(sitebBaseUrl+" interesting jobs", len(interestingJobs))
-    return unCachedJobs, interestingJobs
+    interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteBJobInfo)
+    return interestingJobs
 }
diff --git a/scraper/sitec/sitec.go b/scraper/sitec/sitec.go
index ca522ea..07f77f2 100644
--- a/scraper/sitec/sitec.go
+++ b/scraper/sitec/sitec.go
@@ -1,9 +1,6 @@
 package sitec
 
 import (
-    "log"
-
-    "scraper/cache"
     "scraper/interest"
     "scraper/job"
 
@@ -48,7 +45,7 @@ func getSiteCJobInfo(jobUrl string, proxyUrl string) (string, error) {
     return description, nil
 }
 
-func ScanNewJobs(sitecBaseUrl string, proxyUrl string, cache *cache.Cache) ([]job.Job, []job.Job) {
+func ScanNewJobs(sitecBaseUrl string, proxyUrl string) []job.Job {
     var jobs = []job.Job{}
     jobChannel := make(chan []job.Job, 2)
 
@@ -66,13 +63,6 @@ func ScanNewJobs(sitecBaseUrl string, proxyUrl string, cache *cache.Cache) ([]jo
         jobs = append(jobs, <-jobChannel...)
     }
 
-    log.Println(sitecBaseUrl+" total jobs found", len(jobs))
-    unCachedJobs, err := cache.FilterCachedCompanies(jobs)
-    if err != nil {
-        log.Println("Error filtering cached companies", err)
-    }
-    log.Println(sitecBaseUrl+" total jobs not found in cache", len(unCachedJobs))
-    interestingJobs := interest.FilterInterest(proxyUrl, unCachedJobs, getSiteCJobInfo)
-    log.Println(sitecBaseUrl+" interesting jobs", len(interestingJobs))
-    return unCachedJobs, interestingJobs
+    interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteCJobInfo)
+    return interestingJobs
 }
diff --git a/scraper/sited/sited.go b/scraper/sited/sited.go
index 985a14b..d054705 100644
--- a/scraper/sited/sited.go
+++ b/scraper/sited/sited.go
@@ -1,8 +1,6 @@
 package sited
 
 import (
-    "log"
-    "scraper/cache"
     "scraper/interest"
     "scraper/job"
     "strings"
@@ -43,15 +41,8 @@ func getSiteDJobInfo(jobUrl string, proxyUrl string) (string, error) {
     return jobInfo, nil
 }
 
-func ScanNewJobs(siteDBaseUrl string, proxyUrl string, cache *cache.Cache) ([]job.Job, []job.Job) {
+func ScanNewJobs(siteDBaseUrl string, proxyUrl string) []job.Job {
     jobs := job.GetNewJobs(siteDBaseUrl+"/remote-jobs/developer/", proxyUrl, siteDJobListParser)
-    log.Println(siteDBaseUrl+" total jobs found", len(jobs))
-    unCachedJobs, err := cache.FilterCachedCompanies(jobs)
-    if err != nil {
-        log.Println("Error filtering cached companies", err)
-    }
-    log.Println(siteDBaseUrl+" total jobs not found in cache", len(unCachedJobs))
-    interestingJobs := interest.FilterInterest(proxyUrl, unCachedJobs, getSiteDJobInfo)
-    log.Println(siteDBaseUrl+" interesting jobs", len(interestingJobs))
-    return unCachedJobs, interestingJobs
+    interestingJobs := interest.FilterInterest(proxyUrl, jobs, getSiteDJobInfo)
+    return interestingJobs
 }
diff --git a/scraper/sitee/sitee.go b/scraper/sitee/sitee.go
index b1aa61e..5832346 100644
--- a/scraper/sitee/sitee.go
+++ b/scraper/sitee/sitee.go
@@ -1,8 +1,6 @@
 package sitee
 
 import (
-    "log"
-    "scraper/cache"
     "scraper/interest"
     "scraper/job"
     "strings"
@@ -47,15 +45,8 @@ func getSiteEJobInfo(jobUrl string, proxyUrl string) (string, error) {
     return jobInfo, nil
 }
 
-func ScanNewJobs(baseURL string, proxyURL string, cache *cache.Cache) ([]job.Job, []job.Job) {
+func ScanNewJobs(baseURL string, proxyURL string) []job.Job {
     jobs := job.GetNewJobs(baseURL+"/category/development", proxyURL, siteEJobListParser)
-    log.Println(baseURL+" total jobs found", len(jobs))
-    unCachedJobs, err := cache.FilterCachedCompanies(jobs)
-    if err != nil {
-        log.Println("Error filtering cached companies", err)
-    }
-    log.Println(baseURL+" total jobs not found in cache", len(unCachedJobs))
-    interestingJobs := interest.FilterInterest(proxyURL, unCachedJobs, getSiteEJobInfo)
-    log.Println(baseURL+" total interesting jobs found", len(interestingJobs))
-    return unCachedJobs, interestingJobs
+    interestingJobs := interest.FilterInterest(proxyURL, jobs, getSiteEJobInfo)
+    return interestingJobs
 }
diff --git a/scraper/sitef/sitef.go b/scraper/sitef/sitef.go
index 8a55521..9103e32 100644
--- a/scraper/sitef/sitef.go
+++ b/scraper/sitef/sitef.go
@@ -1,8 +1,6 @@
 package sitef
 
 import (
-    "log"
-    "scraper/cache"
     "scraper/interest"
     "scraper/job"
     "strings"
@@ -40,16 +38,9 @@ func getSiteFJobInfo(jobUrl string, proxyUrl string) (string, error) {
     return text, nil
 }
 
-func ScanNewJobs(baseURL string, proxyURL string, cache *cache.Cache) ([]job.Job, []job.Job) {
+func ScanNewJobs(baseURL string, proxyURL string) []job.Job {
"/jobs?category=development&location=north-america&positionType=full-time" jobs := job.GetNewJobs(baseURL+subUrl, proxyURL, siteFJobListParser, "headless") - log.Println(baseURL+" total jobs found", len(jobs)) - unCachedJobs, err := cache.FilterCachedCompanies(jobs) - if err != nil { - log.Println("Error filtering cached companies", err) - } - log.Println(baseURL+" total jobs not found in cache", len(unCachedJobs)) - interestingJobs := interest.FilterInterest(proxyURL, unCachedJobs, getSiteFJobInfo) - log.Println(baseURL+" total interesting jobs found", len(interestingJobs)) - return unCachedJobs, interestingJobs + interestingJobs := interest.FilterInterest(proxyURL, jobs, getSiteFJobInfo) + return interestingJobs } diff --git a/terraform/main.tf b/terraform/main.tf index 447034d..e4826cd 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -28,14 +28,13 @@ module "scraper_lambda" { timeout = 300 env_vars = { "PROXY_URL" = "${module.proxy_gateway.api_url}" - "SCRAPER_WEBHOOK" = "${var.SCRAPER_WEBHOOK}" "SCRAPER_SITEA_BASEURL" = "${var.SCRAPER_SITEA_BASEURL}" "SCRAPER_SITEB_BASEURL" = "${var.SCRAPER_SITEB_BASEURL}" "SCRAPER_SITEC_BASEURL" = "${var.SCRAPER_SITEC_BASEURL}" "SCRAPER_SITED_BASEURL" = "${var.SCRAPER_SITED_BASEURL}" "SCRAPER_SITEE_BASEURL" = "${var.SCRAPER_SITEE_BASEURL}" "SCRAPER_SITEF_BASEURL" = "${var.SCRAPER_SITEF_BASEURL}" - "DYNAMO_TABLE" = "${aws_dynamodb_table.job_scraper_company_cache.name}" + "JOB_URL" = "${module.job_gateway.api_url}" } }