Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Local scraper #19

Merged
merged 2 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,20 @@ jobs:
- name: Build proxy
run: go build -v ./...
working-directory: proxy


- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: v20.12.2

- name: Install local dependencies
# `npm ci` installs exactly what local/package-lock.json pins and fails when the
# lockfile is out of sync with package.json, keeping CI builds reproducible.
run: npm ci
working-directory: local

- name: build local typescript
run: npx tsc
working-directory: local

- name: test local
run: npm test
working-directory: local
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,8 @@ dist
#lambdas zip file
*.zip

#local ts build
/local/build
# output files for the local scraper
/local/*-jobs.txt

5 changes: 5 additions & 0 deletions local/jest.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Jest configuration for the local scraper: TypeScript tests are handled by
// the ts-jest preset and executed in a plain Node environment.
/** @type {import('ts-jest').JestConfigWithTsJest} */
const jestConfig = {
  preset: 'ts-jest',
  testEnvironment: 'node',
};

module.exports = jestConfig;
5,802 changes: 5,802 additions & 0 deletions local/package-lock.json

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions local/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"scripts": {
"start": "npx tsc && node --enable-source-maps ./build/src/index.js",
"test": "jest"
},
"devDependencies": {
"@types/jest": "^29.5.12",
"jest": "^29.7.0",
"ts-jest": "^29.1.2",
"typescript": "^5.3.3"
},
"dependencies": {
"puppeteer": "^22.4.0"
}
}
11 changes: 11 additions & 0 deletions local/src/fileHandler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import fs from 'fs';

// The output file is named after the date on which this module was loaded:
// "<year>-<month>-<day>-jobs.txt" (month and day are not zero-padded).
const date = new Date();

const fileName = [date.getFullYear(), date.getMonth() + 1, date.getDate()].join('-') + '-jobs.txt';

/**
 * Appends one line describing a matched job to the dated output file.
 *
 * The line has the form "pageUrl, jobTitle, companyName, matchedKeyword"
 * followed by a newline. Null values are written as the text "null".
 *
 * @param pageUrl URL of the job posting.
 * @param jobTitle Title of the job, if one was found.
 * @param companyName Name of the hiring company, if one was found.
 * @param matchedKeyword Keyword that made the job interesting.
 */
export function writeToFile(pageUrl: string, jobTitle: string | null, companyName: string | null, matchedKeyword: string) {
  // String() keeps null as "null", matching template-literal interpolation.
  const fields = [pageUrl, String(jobTitle), String(companyName), matchedKeyword];
  const line = fields.join(', ') + '\n';
  fs.appendFileSync(fileName, line);
}
19 changes: 19 additions & 0 deletions local/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import puppeteer from 'puppeteer';
import { scrap } from './scraper/sitea';

/**
 * Entry point of the local scraper.
 *
 * Reads the start URL from the LOCAL_SCRAPER_SITEA environment variable,
 * launches a visible (non-headless) browser, and runs the site scraper on it.
 * The browser is always closed, even when scraping fails, so a failed run does
 * not leave a browser process behind.
 */
const main = async () => {
  const jobCount = 0;
  const link = process.env.LOCAL_SCRAPER_SITEA;
  if (!link) {
    console.error('LOCAL_SCRAPER_SITEA environment variable isnt set');
    // Signal the misconfiguration to the shell instead of exiting with 0.
    process.exitCode = 1;
    return;
  }

  const browser = await puppeteer.launch({ headless: false });
  try {
    await scrap(browser, link, jobCount);
    console.log('Scraping complete');
  } finally {
    // Runs on success and on failure; the original only closed on success.
    await browser.close();
  }
};

main().catch(error => {
  console.error(error);
  // A failed scrape should be visible to callers via a non-zero exit status.
  process.exitCode = 1;
});
47 changes: 47 additions & 0 deletions local/src/interest.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Technologies that make a posting interesting when they appear, as whole
// words, in the job description. Order matters: the first match is reported.
const includedKeywords = ["node", "nodejs", "node.js", "go", "golang", "typescript"];
// Description terms (whole words) that disqualify a posting.
const excludedKeywords = ["contract", "web3", "blockchain", "crypto"];
// A title must contain at least one of these substrings...
const includeTitles = ["software engineer", "developer", "backend engineer", "backend developer", "backend", "software developer", "api"];
// ...and none of these. "dotnet" is listed explicitly because ".net" is now
// matched literally and no longer catches that spelling by accident.
const excludeTitles = ["front-end", "front end", "frontend", ".net", "dotnet", "java", "manager", "lead", "staff", "principal", "contract", "c#", "microsoft"];

/** Escapes regular-expression metacharacters so `text` is matched literally. */
const escapeRegExp = (text: string): string => text.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");

/** Builds a case-insensitive regex matching any of `terms` as a literal substring. */
const anyOf = (terms: readonly string[]): RegExp => new RegExp(terms.map(escapeRegExp).join("|"), "i");

/** Builds a case-insensitive regex matching `keyword` literally between word boundaries. */
const wholeWord = (keyword: string): RegExp => new RegExp("\\b" + escapeRegExp(keyword) + "\\b", "i");

const includeRegex = anyOf(includeTitles);
const excludeRegex = anyOf(excludeTitles);
const excludeCompanyRegex = /(consulting|recruiting)/i;

// Compiled once at module load instead of on every call.
const includedKeywordMatchers = includedKeywords.map(keyword => ({ keyword, regex: wholeWord(keyword) }));
const excludedKeywordMatchers = excludedKeywords.map(wholeWord);

/**
 * Decides whether a job posting is of interest.
 *
 * A posting is interesting when all of the following hold:
 *  - title, company and description are all non-empty;
 *  - the description mentions an included keyword as a whole word;
 *  - the description mentions no excluded keyword as a whole word;
 *  - the title contains an included title term and no excluded title term;
 *  - the company name does not contain "consulting" or "recruiting".
 * All matching is case-insensitive.
 *
 * @param jobTitle Title of the posting, or null when it could not be read.
 * @param companyName Hiring company, or null when it could not be read.
 * @param jobDescriptionText Full description text, or null when it could not be read.
 * @returns The first included keyword found in the description, or null when
 *          the posting is not interesting.
 */
export function evaluateJobInterest(jobTitle: string | null, companyName: string | null, jobDescriptionText: string | null ): string | null {
  if (!companyName || !jobTitle || !jobDescriptionText) {
    return null;
  }
  // Bind the narrowed value so the callbacks below see a plain string.
  const description: string = jobDescriptionText;

  const included = includedKeywordMatchers.find(({ regex }) => regex.test(description));
  if (!included) {
    return null;
  }

  if (excludedKeywordMatchers.some(regex => regex.test(description))) {
    return null;
  }

  const titleIsRelevant = includeRegex.test(jobTitle) && !excludeRegex.test(jobTitle);
  if (!titleIsRelevant || excludeCompanyRegex.test(companyName)) {
    return null;
  }

  return included.keyword;
}
87 changes: 87 additions & 0 deletions local/src/scraper/sitea.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { Browser, Page } from 'puppeteer';
import { evaluateJobInterest } from '../interest';
import { writeToFile } from '../fileHandler';
// URLs of every job page recorded so far, in visit order.
const urls: string[] = [];

/**
 * Waits until the page's URL is one that has not been recorded yet, then
 * records it in `urls`.
 *
 * While the current URL is already in `urls`, logs a message and re-checks
 * every 3 seconds.
 *
 * NOTE(review): there is no upper bound on the wait — if the page never moves
 * to an unseen URL this never resolves.
 *
 * @param page Page whose URL is polled.
 * @returns Always true, once a new URL has been recorded.
 */
const didUrlChange = async (page: Page): Promise<boolean> => {
  let currentUrl: string = page.url();

  while (urls.includes(currentUrl)) {
    // Still on an already-recorded URL: wait three seconds and look again.
    console.log("page url hasn't changed, sleeping for 3 second");
    await new Promise(resolve => setTimeout(resolve, 3000));
    currentUrl = page.url();
  }

  urls.push(currentUrl);
  return true;
};


/**
 * Walks a paginated job listing starting at `link` and records the postings
 * that evaluateJobInterest accepts.
 *
 * For every `.slider_container` element on the current page it clicks the
 * element's `h2`; when that h2's class name starts with "jobTitle" it waits for
 * `#jobDescriptionText`, reads the title, company and description, and appends
 * accepted postings to the output file via writeToFile. It then clicks the
 * `[data-testid="pagination-page-next"]` element and repeats, stopping when no
 * such element is found.
 *
 * @param browser Puppeteer browser in which a new page is opened.
 * @param link URL of the first listing page.
 * @param jobCount Starting value of the counter printed in the "Job found" log.
 *                 It is incremented locally only; nothing is returned, so the
 *                 caller's value is unaffected.
 * @throws Error wrapping any failure; the message contains the last recorded
 *         URL (or 'first page provided' when none was recorded) and the
 *         original error's stack.
 */
export const scrap = async(browser : Browser, link : string, jobCount : number) => {
try{
const page = await browser.newPage();
await page.goto(link);
await page.waitForSelector('.slider_container');

// One iteration per listing page; left via `break` when there is no "Next" button.
while (true) {
const containers = await page.$$('.slider_container');
for (const container of containers) {
const h2 = await container.$('h2');
if (h2) {
// The h2 is clicked before its class name is inspected.
await h2.click();
const className = await page.evaluate(element => element.className, h2);
if (className.startsWith('jobTitle')) {

// Waiting for the new job to load on the subpage
await page.waitForSelector('.slider_container', { visible: true });
await page.waitForSelector('#jobDescriptionText');

// subpage should have loaded by now confirm by checking the url
// (didUrlChange also records the new URL in `urls`)
await didUrlChange(page);

const jobTitle = await page.evaluate(element => element.textContent, h2); // Get the h2 text
let companyName = null;

// The company element is not always present; fall back to a placeholder name
try {
companyName = await container.$eval('[data-testid="company-name"]', element => element.textContent); // Get the company name
} catch (error) {
companyName = 'noCompanyFound';
}

const jobDescriptionText = await page.$eval('#jobDescriptionText', element => element.textContent);
const pageUrl = urls[urls.length - 1]; // Most recently recorded URL, i.e. the one didUrlChange just pushed
const jobCategory = evaluateJobInterest(jobTitle, companyName, jobDescriptionText);

// A non-null result is the matched keyword; only those postings are written out
if(jobCategory){
writeToFile(pageUrl, jobTitle, companyName, jobCategory);
jobCount++;
console.log(`Job found ${jobCount}`);
}

await new Promise(resolve => setTimeout(resolve, 5000)); // Sleep for 5 seconds to avoid bot detection
}
}
}

// Click on the "Next" button and wait for the next page to load
const nextButton = await page.$('[data-testid="pagination-page-next"]');
if (nextButton) {
// Click and navigation wait are started together and awaited as a pair
await Promise.all([
nextButton.click(),
page.waitForNavigation({ waitUntil: 'networkidle0' }),
]);
} else {
break; // Exit the loop if there is no "Next" button
}
}
} catch (error) {
// Re-throw with the last recorded URL so the failing posting can be located
const err = error as Error;
let currentUrl: string = 'first page provided';
if (urls.length) {
currentUrl = urls[urls.length - 1];
}
throw new Error(`An error occurred at ${currentUrl}: ${err?.stack}`);
}
}
37 changes: 37 additions & 0 deletions local/tests/interest.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import { evaluateJobInterest } from '../src/interest';

// Unit tests for evaluateJobInterest. Each case starts from a title and company
// that pass every filter and varies a single input.
describe('evaluateJobInterest', () => {
  const goodTitle = 'Software Engineer';
  const goodCompany = 'Company';

  it('should return null if any of the parameters are null', () => {
    const incompleteInputs: Array<[string | null, string | null, string | null]> = [
      [null, goodCompany, 'Job Description'],
      [goodTitle, null, 'Job Description'],
      [goodTitle, goodCompany, null],
    ];

    for (const [title, company, description] of incompleteInputs) {
      expect(evaluateJobInterest(title, company, description)).toBeNull();
    }
  });

  it('should return null if no included keywords are found', () => {
    const result = evaluateJobInterest(goodTitle, goodCompany, 'Job Description');
    expect(result).toBeNull();
  });

  it('should return the matched keyword if an included keyword is found and no excluded keywords are found', () => {
    const result = evaluateJobInterest(goodTitle, goodCompany, 'go');
    expect(result).toBe('go');
  });

  it('should return the null if an included keyword is part of another word', () => {
    // "go" appears only inside "chicago", so the whole-word match must fail.
    const result = evaluateJobInterest(goodTitle, goodCompany, 'chicago');
    expect(result).toBeNull();
  });

  it('should return null if an included keyword and an excluded keyword are found', () => {
    const result = evaluateJobInterest(goodTitle, goodCompany, 'node contract');
    expect(result).toBeNull();
  });

  it('should return null if the job title does not match the include regex', () => {
    const result = evaluateJobInterest('Bad Job Title', goodCompany, 'node');
    expect(result).toBeNull();
  });

  it('should return null if the job title matches the exclude regex', () => {
    const result = evaluateJobInterest('Front-End Developer', goodCompany, 'node');
    expect(result).toBeNull();
  });

  it('should return null if the company name matches the exclude company regex', () => {
    const result = evaluateJobInterest(goodTitle, 'Consulting', 'node');
    expect(result).toBeNull();
  });
});
Loading
Loading