-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #19 from austin1237/local
Local scraper
- Loading branch information
Showing
12 changed files
with
6,160 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
/** @type {import('ts-jest').JestConfigWithTsJest} */ | ||
module.exports = { | ||
preset: 'ts-jest', | ||
testEnvironment: 'node', | ||
}; |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"scripts": { | ||
"start": "npx tsc && node --enable-source-maps ./build/src/index.js", | ||
"test": "jest" | ||
}, | ||
"devDependencies": { | ||
"@types/jest": "^29.5.12", | ||
"jest": "^29.7.0", | ||
"ts-jest": "^29.1.2", | ||
"typescript": "^5.3.3" | ||
}, | ||
"dependencies": { | ||
"puppeteer": "^22.4.0" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
import fs from 'fs'; | ||
|
||
// The output file is named after the date on which this module is loaded,
// e.g. "2024-3-7-jobs.txt" (month and day are not zero-padded).
const date = new Date();

const fileName = `${date.getFullYear()}-${date.getMonth() + 1}-${date.getDate()}-jobs.txt`;

/**
 * Makes a single value safe to store in one comma-separated record.
 *
 * Whitespace runs (including newlines) are collapsed to a single space so a
 * value can never split a record across lines. Values containing a comma or a
 * double quote are wrapped in double quotes with inner quotes doubled.
 * Values that need neither treatment are returned unchanged, and `null` is
 * written as the text "null", as before.
 */
function formatField(value: string | null): string {
  const flattened = String(value).replace(/\s+/g, ' ').trim();
  if (/[",]/.test(flattened)) {
    return `"${flattened.replace(/"/g, '""')}"`;
  }
  return flattened;
}

/**
 * Appends one job record to today's output file as a single line:
 * `pageUrl, jobTitle, companyName, matchedKeyword`.
 *
 * @param pageUrl URL of the job posting.
 * @param jobTitle Title of the job, or null when it could not be read.
 * @param companyName Name of the hiring company, or null when unknown.
 * @param matchedKeyword Keyword that made the job interesting.
 */
export function writeToFile(pageUrl: string, jobTitle: string | null, companyName: string | null, matchedKeyword: string): void {
  const fields = [pageUrl, jobTitle, companyName, matchedKeyword].map(formatField);
  fs.appendFileSync(fileName, `${fields.join(', ')}\n`);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import puppeteer from 'puppeteer'; | ||
import { scrap } from './scraper/sitea'; | ||
|
||
/**
 * Entry point: reads the start URL from the LOCAL_SCRAPER_SITEA environment
 * variable, launches a visible (non-headless) browser and runs the scraper
 * against that URL.
 *
 * The browser is closed in a `finally` block so that a failure inside `scrap`
 * does not leave the launched browser process running.
 */
const main = async () => {
  const jobCount = 0;
  const link = process.env.LOCAL_SCRAPER_SITEA;
  if (!link) {
    console.error('LOCAL_SCRAPER_SITEA environment variable isnt set');
    return;
  }
  const browser = await puppeteer.launch({ headless: false });
  try {
    await scrap(browser, link, jobCount);
    console.log('Scraping complete');
  } finally {
    // Runs on success and on error; the error (if any) still propagates.
    await browser.close();
  }
}

main().catch(error => {
  console.error(error);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
// Technologies that must appear (as whole words) in the job description.
const includedKeywords = ["node", "nodejs", "node.js", "go", "golang", "typescript"];
// Whole words in the job description that disqualify a job.
const excludedKeywords = ["contract", "web3", "blockchain", "crypto"]
// Substrings of which at least one must appear in the job title.
const includeTitles = ["software engineer", "developer", "backend engineer", "backend developer", "backend", "software developer", "api"];
// Substrings that disqualify a job title. "dotnet" is listed explicitly: it
// used to be rejected only because the unescaped "." in ".net" matched any
// character, which also wrongly rejected titles such as "Kubernetes".
const excludeTitles = ["front-end", "front end", "frontend", ".net", "dotnet", "java", "manager", "lead", "staff", "principal", "contract", "c#", "microsoft"];

/** Escapes regex metacharacters so `text` is matched literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}

/** Builds a case-insensitive regex matching `keyword` between word boundaries. */
function toWholeWordRegex(keyword: string): RegExp {
  return new RegExp("\\b" + escapeRegExp(keyword) + "\\b", "i");
}

/** Builds a case-insensitive regex matching any of `terms` as a literal substring. */
function toAlternationRegex(terms: readonly string[]): RegExp {
  return new RegExp(terms.map(escapeRegExp).join("|"), "i");
}

// Regexes are compiled once at module load instead of on every call.
const includedKeywordMatchers = includedKeywords.map(keyword => ({ keyword, regex: toWholeWordRegex(keyword) }));
const excludedKeywordRegexes = excludedKeywords.map(keyword => toWholeWordRegex(keyword));
const includeRegex = toAlternationRegex(includeTitles);
const excludeRegex = toAlternationRegex(excludeTitles);
const excludeCompanyRegex = /(consulting|recruiting)/i;

/**
 * Decides whether a job posting is of interest.
 *
 * A job is interesting when all of the following hold:
 *  - title, company and description are all non-empty;
 *  - the description contains at least one included keyword as a whole word;
 *  - the description contains no excluded keyword as a whole word;
 *  - the title contains an included title term and no excluded title term;
 *  - the company name does not contain "consulting" or "recruiting".
 *
 * All matching is case-insensitive.
 *
 * @param jobTitle Title of the posting.
 * @param companyName Name of the hiring company.
 * @param jobDescriptionText Full text of the job description.
 * @returns The first included keyword (in list order) found in the
 *          description, or null when the job is not of interest.
 */
export function evaluateJobInterest(jobTitle: string | null, companyName: string | null, jobDescriptionText: string | null): string | null {
  if (!companyName || !jobTitle || !jobDescriptionText) {
    return null;
  }

  const matched = includedKeywordMatchers.find(({ regex }) => regex.test(jobDescriptionText));
  if (!matched) {
    return null;
  }

  if (excludedKeywordRegexes.some(regex => regex.test(jobDescriptionText))) {
    return null;
  }

  if (!includeRegex.test(jobTitle) || excludeRegex.test(jobTitle) || excludeCompanyRegex.test(companyName)) {
    return null;
  }

  return matched.keyword;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import { Browser, Page } from 'puppeteer'; | ||
import { evaluateJobInterest } from '../interest'; | ||
import { writeToFile } from '../fileHandler'; | ||
// Every page URL recorded so far by didUrlChange, in the order it was seen.
const urls: string[] = [];

/**
 * Waits until the page's URL is one that has not been recorded in `urls`,
 * then records it.
 *
 * While the current URL is already in `urls`, a message is logged and the
 * check is repeated after a 3 second pause. There is no upper bound on the
 * number of retries.
 *
 * @param page Page whose URL is polled.
 * @returns Always resolves to `true`, once a new URL has been recorded.
 */
const didUrlChange = async (page: Page): Promise<boolean> => {
  let currentUrl: string = page.url();

  while (urls.includes(currentUrl)) {
    console.log('page url hasn\'t changed, sleeping for 3 second');
    await new Promise(resolve => setTimeout(resolve, 3000));
    currentUrl = page.url();
  }

  urls.push(currentUrl);
  return true;
};
|
||
|
||
/**
 * Walks the paginated job listing at `link`, opening each job card, and
 * writes every job that `evaluateJobInterest` accepts to the output file.
 *
 * @param browser Browser used to open the scraping page.
 * @param link URL of the first listing page.
 * @param jobCount Starting value of the matched-job counter.
 * @returns The counter after scraping: `jobCount` plus the number of jobs
 *          written. (It is a by-value parameter, so returning it is the only
 *          way the caller can see the total.)
 * @throws Error wrapping any failure, prefixed with the last recorded URL.
 */
export const scrap = async (browser: Browser, link: string, jobCount: number): Promise<number> => {
  // Tracked outside the try block so the page can be closed in `finally`.
  let openedPage: Page | undefined;
  try {
    const page = await browser.newPage();
    openedPage = page;
    await page.goto(link);
    await page.waitForSelector('.slider_container');

    while (true) {
      const containers = await page.$$('.slider_container');
      for (const container of containers) {
        const h2 = await container.$('h2');
        if (h2) {
          await h2.click();
          const className = await page.evaluate(element => element.className, h2);
          if (className.startsWith('jobTitle')) {

            // Waiting for the new job to load on the subpage
            await page.waitForSelector('.slider_container', { visible: true });
            await page.waitForSelector('#jobDescriptionText');

            // subpage should have loaded by now confirm by checking the url
            await didUrlChange(page);

            const jobTitle = await page.evaluate(element => element.textContent, h2); // Get the h2 text
            let companyName = null;

            // company isn't always set for w/e reason
            try {
              companyName = await container.$eval('[data-testid="company-name"]', element => element.textContent); // Get the company name
            } catch (error) {
              companyName = 'noCompanyFound';
            }

            const jobDescriptionText = await page.$eval('#jobDescriptionText', element => element.textContent);
            const pageUrl = urls[urls.length - 1]; // URL recorded by didUrlChange above
            const jobCategory = evaluateJobInterest(jobTitle, companyName, jobDescriptionText);

            if (jobCategory) {
              writeToFile(pageUrl, jobTitle, companyName, jobCategory);
              jobCount++;
              console.log(`Job found ${jobCount}`);
            }

            await new Promise(resolve => setTimeout(resolve, 5000)); // Sleep for 5 seconds to avoid bot detection
          }
        }
      }

      // Click on the "Next" button and wait for the next page to load
      const nextButton = await page.$('[data-testid="pagination-page-next"]');
      if (nextButton) {
        await Promise.all([
          nextButton.click(),
          page.waitForNavigation({ waitUntil: 'networkidle0' }),
        ]);
      } else {
        break; // Exit the loop if there is no "Next" button
      }
    }

    return jobCount;
  } catch (error) {
    let currentUrl: string = 'first page provided';
    if (urls.length) {
      currentUrl = urls[urls.length - 1];
    }
    const detail = error instanceof Error ? error.stack : String(error);
    throw new Error(`An error occurred at ${currentUrl}: ${detail}`);
  } finally {
    if (openedPage) {
      try {
        await openedPage.close();
      } catch {
        // Best-effort cleanup: a failure to close the page must not mask the
        // scraping result or the original error.
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import { evaluateJobInterest } from '../src/interest'; | ||
|
||
// Unit tests for evaluateJobInterest: null handling, keyword matching in the
// description, and the title / company filters.
describe('evaluateJobInterest', () => {
  it('should return null if any of the parameters are null', () => {
    expect(evaluateJobInterest(null, 'Company', 'Job Description')).toBeNull();
    expect(evaluateJobInterest('Software Engineer', null, 'Job Description')).toBeNull();
    expect(evaluateJobInterest('Software Engineer', 'Company', null)).toBeNull();
  });

  it('should return null if no included keywords are found', () => {
    expect(evaluateJobInterest('Software Engineer', 'Company', 'Job Description')).toBeNull();
  });

  it('should return the matched keyword if an included keyword is found and no excluded keywords are found', () => {
    expect(evaluateJobInterest('Software Engineer', 'Company', 'go')).toBe('go');
  });

  it('should match included keywords case-insensitively and return the keyword as listed', () => {
    expect(evaluateJobInterest('Software Engineer', 'Company', 'We build services in Node and Go')).toBe('node');
  });

  it('should return null if an included keyword is part of another word', () => {
    expect(evaluateJobInterest('Software Engineer', 'Company', 'chicago')).toBeNull();
  });

  it('should return null if an included keyword and an excluded keyword are found', () => {
    expect(evaluateJobInterest('Software Engineer', 'Company', 'node contract')).toBeNull();
  });

  it('should return null if the job title does not match the include regex', () => {
    expect(evaluateJobInterest('Bad Job Title', 'Company', 'node')).toBeNull();
  });

  it('should return null if the job title matches the exclude regex', () => {
    expect(evaluateJobInterest('Front-End Developer', 'Company', 'node')).toBeNull();
  });

  it('should return null if the company name matches the exclude company regex', () => {
    expect(evaluateJobInterest('Software Engineer', 'Consulting', 'node')).toBeNull();
    expect(evaluateJobInterest('Software Engineer', 'Acme Recruiting', 'node')).toBeNull();
  });
});
Oops, something went wrong.