Skip to content

Commit

Permalink
Merge pull request #19 from austin1237/local
Browse files Browse the repository at this point in the history
Local scraper
  • Loading branch information
austin1237 authored Apr 17, 2024
2 parents 3256334 + f3e0a8b commit 64c8988
Show file tree
Hide file tree
Showing 12 changed files with 6,160 additions and 1 deletion.
18 changes: 17 additions & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,20 @@ jobs:
# Compile every Go package under the proxy directory
- name: Build proxy
run: go build -v ./...
working-directory: proxy


# Install the pinned Node.js version used by the steps that run in the local directory
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: v20.12.2

# Install the npm dependencies declared for the local scraper
- name: Install local dependencies
run: npm install
working-directory: local

# Run the TypeScript compiler over the local scraper
- name: build local typescript
run: npx tsc
working-directory: local

# Run the npm "test" script of the local scraper
- name: test local
run: npm test
working-directory: local
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,8 @@ dist
#lambdas zip file
*.zip

#local ts build
/local/build
# output files for the local scraper
/local/*-jobs.txt

5 changes: 5 additions & 0 deletions local/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Jest configuration for the local scraper's test suite.
/** @type {import('ts-jest').JestConfigWithTsJest} */
module.exports = {
// Use the ts-jest preset so Jest can run the TypeScript test files.
preset: 'ts-jest',
// Tests run in a Node.js environment.
testEnvironment: 'node',
};
5,802 changes: 5,802 additions & 0 deletions local/package-lock.json

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions local/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"scripts": {
"start": "npx tsc && node --enable-source-maps ./build/src/index.js",
"test": "jest"
},
"devDependencies": {
"@types/jest": "^29.5.12",
"jest": "^29.7.0",
"ts-jest": "^29.1.2",
"typescript": "^5.3.3"
},
"dependencies": {
"puppeteer": "^22.4.0"
}
}
11 changes: 11 additions & 0 deletions local/src/fileHandler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import fs from 'fs';

// The output file is named after the local date at which this module is loaded,
// in the form <year>-<month>-<day>-jobs.txt (month and day are not zero-padded).
const loadedAt = new Date();
const fileName = [loadedAt.getFullYear(), loadedAt.getMonth() + 1, loadedAt.getDate()].join('-') + '-jobs.txt';

/**
 * Appends one line describing a matched job to the dated output file.
 *
 * The line has the form `<pageUrl>, <jobTitle>, <companyName>, <matchedKeyword>`
 * followed by a newline. A `null` title or company is written as the text "null".
 *
 * @param pageUrl - URL of the job posting.
 * @param jobTitle - Title of the job, or null when unknown.
 * @param companyName - Name of the hiring company, or null when unknown.
 * @param matchedKeyword - Keyword that made the job interesting.
 */
export function writeToFile(pageUrl: string, jobTitle: string | null, companyName: string | null, matchedKeyword: string) {
  const fields = [pageUrl, jobTitle, companyName, matchedKeyword];
  // String() keeps null rendered as "null", exactly like template interpolation would.
  const line = fields.map((field) => String(field)).join(', ') + '\n';
  fs.appendFileSync(fileName, line);
}
19 changes: 19 additions & 0 deletions local/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import puppeteer from 'puppeteer';
import { scrap } from './scraper/sitea';

/**
 * Entry point of the local scraper.
 *
 * Reads the start URL from the LOCAL_SCRAPER_SITEA environment variable,
 * launches a visible (non-headless) browser, runs the scraper and closes the
 * browser again. The browser is closed even when scraping fails, and a missing
 * environment variable marks the process as failed.
 */
const main = async () => {
  const initialJobCount = 0;
  const link = process.env.LOCAL_SCRAPER_SITEA;
  if (!link) {
    console.error('LOCAL_SCRAPER_SITEA environment variable isnt set');
    process.exitCode = 1;
    return;
  }

  const browser = await puppeteer.launch({ headless: false });
  try {
    await scrap(browser, link, initialJobCount);
    console.log('Scraping complete');
  } finally {
    // Always release the browser, otherwise a scraping error leaves it running.
    await browser.close();
  }
}

// Report any failure and signal it through a non-zero exit code.
main().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
47 changes: 47 additions & 0 deletions local/src/interest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Technologies that make a job description interesting (matched as whole words).
const includedKeywords = ["node", "nodejs", "node.js", "go", "golang", "typescript"];
// Words in a job description that disqualify the job (matched as whole words).
const excludedKeywords = ["contract", "web3", "blockchain", "crypto"]
// A job title must contain at least one of these terms...
const includeTitles = ["software engineer", "developer", "backend engineer", "backend developer", "backend", "software developer", "api"];
// ...and none of these.
const excludeTitles = ["front-end", "front end", "frontend", ".net", "java", "manager", "lead", "staff", "principal", "contract", "c#", "microsoft"];

/** Escapes regex metacharacters so a plain-text term is matched literally (e.g. the dot in ".net"). */
const escapeRegExp = (term: string): string => term.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/** Builds a case-insensitive matcher for a keyword surrounded by word boundaries. */
const wholeWordRegex = (keyword: string): RegExp => new RegExp("\\b" + escapeRegExp(keyword) + "\\b", "i");

/** Builds a case-insensitive matcher that hits when any of the terms occurs as a substring. */
const anyTermRegex = (terms: string[]): RegExp => new RegExp(terms.map(escapeRegExp).join("|"), "i");

// All matchers are compiled once at module load instead of on every call.
const includedKeywordMatchers = includedKeywords.map(keyword => ({ keyword, regex: wholeWordRegex(keyword) }));
const excludedKeywordMatchers = excludedKeywords.map(keyword => wholeWordRegex(keyword));
const includeRegex = anyTermRegex(includeTitles);
const excludeRegex = anyTermRegex(excludeTitles);
const excludeCompanyRegex = /(consulting|recruiting)/i;

/**
 * Decides whether a job posting is of interest.
 *
 * A job is interesting when its description mentions at least one included
 * keyword and no excluded keyword, its title matches the include list but not
 * the exclude list, and its company name does not look like a consulting or
 * recruiting firm. All comparisons are case-insensitive.
 *
 * @param jobTitle - Title of the job posting.
 * @param companyName - Name of the hiring company.
 * @param jobDescriptionText - Full text of the job description.
 * @returns The first included keyword (in list order) found in the description,
 *          or null when the job is not interesting or any argument is null/empty.
 */
export function evaluateJobInterest(jobTitle: string | null, companyName: string | null, jobDescriptionText: string | null ): string | null{
  if (!companyName || !jobTitle || !jobDescriptionText) {
    return null;
  }
  // Bind the narrowed value to a const so it stays a plain string inside the callbacks below.
  const description: string = jobDescriptionText;

  const included = includedKeywordMatchers.find(({ regex }) => regex.test(description));
  if (!included) {
    return null;
  }

  if (excludedKeywordMatchers.some(regex => regex.test(description))) {
    return null;
  }

  if (!includeRegex.test(jobTitle) || excludeRegex.test(jobTitle) || excludeCompanyRegex.test(companyName)) {
    return null;
  }

  return included.keyword;
}
87 changes: 87 additions & 0 deletions local/src/scraper/sitea.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { Browser, Page } from 'puppeteer';
import { evaluateJobInterest } from '../interest';
import { writeToFile } from '../fileHandler';
const urls: string[] = [];

/**
 * Waits until the page reports a URL that has not been recorded in `urls` yet,
 * then records that URL.
 *
 * While the current URL is already in `urls`, it logs a message, sleeps for
 * 3 seconds and checks again. There is no upper bound on the number of checks.
 *
 * @param page - Page whose current URL is inspected.
 * @returns Always true, once a new URL has been recorded.
 */
const didUrlChange = async (page: Page): Promise<boolean> => {
  let currentUrl: string = page.url();

  // Poll every 3 seconds for as long as the URL is one we have already seen.
  while (urls.includes(currentUrl)) {
    console.log("page url hasn't changed, sleeping for 3 second");
    await new Promise(resolve => setTimeout(resolve, 3000));
    currentUrl = page.url();
  }

  urls.push(currentUrl);
  return true;
};


/**
 * Walks the job listing starting at `link`, page by page, opening each job card
 * and writing the jobs accepted by `evaluateJobInterest` to the output file.
 *
 * @param browser - Puppeteer browser in which a new page is opened for the crawl.
 * @param link - URL of the first listing page.
 * @param jobCount - Starting value of the matched-job counter; it is incremented
 *                   and logged inside this function but not returned.
 * @throws Error wrapping any failure, with the most recently recorded URL
 *         (or 'first page provided' if none was recorded) and the original stack.
 */
export const scrap = async(browser : Browser, link : string, jobCount : number) => {
try{
const page = await browser.newPage();
await page.goto(link);
// Wait for at least one job card before starting to iterate
await page.waitForSelector('.slider_container');

// One iteration per listing page; the loop ends when there is no "Next" button
while (true) {
const containers = await page.$$('.slider_container');
for (const container of containers) {
const h2 = await container.$('h2');
if (h2) {
// The heading is clicked before its class is inspected
await h2.click();
const className = await page.evaluate(element => element.className, h2);
// Only headings whose class name starts with 'jobTitle' are processed further
if (className.startsWith('jobTitle')) {

// Waiting for the new job to load on the subpage
await page.waitForSelector('.slider_container', { visible: true });
await page.waitForSelector('#jobDescriptionText');

// subpage should have loaded by now confirm by checking the url
// (didUrlChange does not return until the page reports a URL not seen before)
await didUrlChange(page);

const jobTitle = await page.evaluate(element => element.textContent, h2); // Get the h2 text
let companyName = null;

// The company element isn't always present; fall back to a placeholder name
try {
companyName = await container.$eval('[data-testid="company-name"]', element => element.textContent); // Get the company name
} catch (error) {
companyName = 'noCompanyFound';
}

const jobDescriptionText = await page.$eval('#jobDescriptionText', element => element.textContent);
const pageUrl = urls[urls.length - 1]; // URL most recently recorded by didUrlChange
const jobCategory = evaluateJobInterest(jobTitle, companyName, jobDescriptionText);

// A non-null category means the job passed every filter; record it
if(jobCategory){
writeToFile(pageUrl, jobTitle, companyName, jobCategory);
jobCount++;
console.log(`Job found ${jobCount}`);
}

await new Promise(resolve => setTimeout(resolve, 5000)); // Sleep for 5 seconds to avoid bot detection
}
}
}

// Click on the "Next" button and wait for the next page to load
const nextButton = await page.$('[data-testid="pagination-page-next"]');
if (nextButton) {
// Start the click and the navigation wait together
await Promise.all([
nextButton.click(),
page.waitForNavigation({ waitUntil: 'networkidle0' }),
]);
} else {
break; // Exit the loop if there is no "Next" button
}
}
} catch (error) {
// Re-throw with the last recorded URL so the failing job can be located
const err = error as Error;
let currentUrl: string = 'first page provided';
if (urls.length) {
currentUrl = urls[urls.length - 1];
}
throw new Error(`An error occurred at ${currentUrl}: ${err?.stack}`);
}
}
37 changes: 37 additions & 0 deletions local/tests/interest.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { evaluateJobInterest } from '../src/interest';

// Unit tests for the job filter: each case changes exactly one input relative
// to a title/company pair that otherwise passes every check.
describe('evaluateJobInterest', () => {
  const goodTitle = 'Software Engineer';
  const goodCompany = 'Company';

  it('should return null if any of the parameters are null', () => {
    const incompleteInputs: Array<[string | null, string | null, string | null]> = [
      [null, goodCompany, 'Job Description'],
      [goodTitle, null, 'Job Description'],
      [goodTitle, goodCompany, null],
    ];
    for (const [title, company, description] of incompleteInputs) {
      expect(evaluateJobInterest(title, company, description)).toBeNull();
    }
  });

  it('should return null if no included keywords are found', () => {
    const result = evaluateJobInterest(goodTitle, goodCompany, 'Job Description');
    expect(result).toBeNull();
  });

  it('should return the matched keyword if an included keyword is found and no excluded keywords are found', () => {
    const result = evaluateJobInterest(goodTitle, goodCompany, 'go');
    expect(result).toBe('go');
  });

  it('should return the null if an included keyword is part of another word', () => {
    // "chicago" ends in "go" but must not count as the keyword "go"
    const result = evaluateJobInterest(goodTitle, goodCompany, 'chicago');
    expect(result).toBeNull();
  });

  it('should return null if an included keyword and an excluded keyword are found', () => {
    const result = evaluateJobInterest(goodTitle, goodCompany, 'node contract');
    expect(result).toBeNull();
  });

  it('should return null if the job title does not match the include regex', () => {
    const result = evaluateJobInterest('Bad Job Title', goodCompany, 'node');
    expect(result).toBeNull();
  });

  it('should return null if the job title matches the exclude regex', () => {
    const result = evaluateJobInterest('Front-End Developer', goodCompany, 'node');
    expect(result).toBeNull();
  });

  it('should return null if the company name matches the exclude company regex', () => {
    const result = evaluateJobInterest(goodTitle, 'Consulting', 'node');
    expect(result).toBeNull();
  });
});
Loading

0 comments on commit 64c8988

Please sign in to comment.