Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

If the page URL doesn't change or is a duplicate for 9 seconds, skip it. #28

Merged
merged 1 commit into from
Jun 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 26 additions & 20 deletions local/src/scraper/sitea.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,19 @@ import { evaluateJobInterest } from '../interest';
import { JobService, Job } from '../jobService';
const urls: string[] = [];

const didUrlChange = async (page: Page): Promise<boolean> => {
const isUrlADup = async (page: Page, counter: number = 0): Promise<boolean> => {

const url: string = page.url();
if (urls.includes(url)) {
// sleep for three seconds and check again
console.log('page url hasn\'t changed, sleeping for 3 second');
console.log('page url hasn\'t changed sleeping for 3 second');
await new Promise(resolve => setTimeout(resolve, 3000));
return didUrlChange(page);
counter++;
if (counter >= 3) {
console.log(`${page.url()} is a duplicate skipping to next job`);
return false; // If URL didn't change 3 times, return false
}
return isUrlADup(page, counter);
}

urls.push(url);
Expand Down Expand Up @@ -39,27 +44,28 @@ export const scrap = async(browser : Browser, link : string, jobCount : number,
await page.waitForSelector('#jobDescriptionText');

// subpage should have loaded by now confirm by checking the url
await didUrlChange(page);

const jobTitle = await page.evaluate(element => element.textContent, h2); // Get the h2 text
let companyName = null;
const isNonDupPosting = await isUrlADup(page);
if (isNonDupPosting) {
const jobTitle = await page.evaluate(element => element.textContent, h2); // Get the h2 text
let companyName = null;

// company isn't always set for w/e reason
try {
companyName = await container.$eval('[data-testid="company-name"]', element => element.textContent); // Get the company name
} catch (error) {
companyName = 'noCompanyFound';
}
// company isn't always set for w/e reason
try {
companyName = await container.$eval('[data-testid="company-name"]', element => element.textContent); // Get the company name
} catch (error) {
companyName = 'noCompanyFound';
}

const jobDescriptionText = await page.$eval('#jobDescriptionText', element => element.textContent);
const pageUrl = urls[urls.length - 1]; // Get the current page URL
const jobCategory = evaluateJobInterest(jobTitle, companyName, jobDescriptionText);
const jobDescriptionText = await page.$eval('#jobDescriptionText', element => element.textContent);
const pageUrl = urls[urls.length - 1]; // Get the current page URL
const jobCategory = evaluateJobInterest(jobTitle, companyName, jobDescriptionText);

if(jobCategory && jobTitle && companyName && pageUrl){
interestingJobs.push({ title: jobTitle, company: companyName, keyword: jobCategory, link: pageUrl });
if(jobCategory && jobTitle && companyName && pageUrl){
interestingJobs.push({ title: jobTitle, company: companyName, keyword: jobCategory, link: pageUrl });
}

await new Promise(resolve => setTimeout(resolve, 5000)); // Sleep for 5 seconds to avoid bot detection
}

await new Promise(resolve => setTimeout(resolve, 5000)); // Sleep for 5 seconds to avoid bot detection
}
}
}
Expand Down
Loading