Skip to content

Commit

Permalink
add logs for debugging and cleanup script
Browse files: browse the repository at this point in the history
  • Loading branch information
this-pama committed Aug 6, 2024
1 parent 9eed023 commit 05ee305
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 2 deletions.
12 changes: 10 additions & 2 deletions controllers/partial/extractData.js
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,10 @@ const extractDataFromUrl = async (driver, url, ignoreRelevanceCheck = false) =>
let content = ["blog", "publications"].includes(data.article_type) || $("body article").length > 0
? $("body article").text().trim()
: $("body").text().trim();

content = content.replace(/Skip to main content/g, ' ');
content = content.replace(/<iframe.*?<\/iframe>/g, '');

data.html_content = content;
data.content = content;
}
Expand All @@ -249,9 +253,13 @@ const extractDataFromUrl = async (driver, url, ignoreRelevanceCheck = false) =>
return null;
}

if(!data?.content?.length > 0){
console.log(' No content could be extracted. Url: ', url)
}

// Extract document meta
const [lang, location, meta] = await getDocumentMeta(data.content);
const iso3_b = await getIso3(url);
const [lang, location, meta] = data?.content ?? await getDocumentMeta(data.content);
const iso3_b = url ?? await getIso3(url);
if(!iso3_b && data.url.includes('/acceleratorlabs/')){
iso3_b = 'NUL' //Url matches Global network page.
}
Expand Down
65 changes: 65 additions & 0 deletions scripts/js/clean_content.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
require("dotenv").config();
const { DB } = require("../../db");

// Literal skip-link text that the cleanup removes from stored article content.
const SKIP_LINK_PATTERN = /Skip to main content/g;

// Exact Google Tag Manager iframe markup that the cleanup removes from stored article content.
const GTM_IFRAME_PATTERN = /<iframe src="https:\/\/www\.googletagmanager\.com\/ns\.html\?id=GTM-PHKHS2Q" height="0" width="0" style="display:none;visibility:hidden"><\/iframe>/g;

/**
 * Remove the known boilerplate fragments from a piece of article text.
 *
 * Both removals are chained on the same value so that neither one is lost
 * (previously the second replacement restarted from the raw text and
 * discarded the first).
 *
 * @param {string} text - Raw article content.
 * @returns {string} The text with every "Skip to main content" occurrence and
 *   every matching GTM iframe removed.
 */
function stripBoilerplate(text) {
  return text.replace(SKIP_LINK_PATTERN, "").replace(GTM_IFRAME_PATTERN, "");
}

/**
 * One-off cleanup: select articles whose stored content still contains the
 * boilerplate fragments, strip them, and write the cleaned text back to both
 * `article_content.content` and `article_html_content.html_content`.
 */
(async () => {
  try {
    // NOTE(review): LIMIT 2 means each run touches at most two articles —
    // looks like a debugging leftover; confirm before relying on a single run.
    const blogs = await DB.blog.any(
      `
      SELECT a.id, a.url, b.content
      FROM articles a
      JOIN article_content b ON b.article_id = a.id
      JOIN article_html_content c ON c.article_id = a.id
      JOIN raw_html d ON d.article_id = a.id
      WHERE a.relevance > 1
      AND (
      b.content LIKE '%Skip to main content%'
      OR b.content LIKE '%<iframe src="https://www.googletagmanager.com/ns.html?id=GTM-PHKHS2Q" height="0" width="0" style="display:none;visibility:hidden"></iframe>%'
      )
      ORDER BY a.id DESC
      LIMIT 2
      ;
      `
    );

    for (const bg of blogs) {
      // Block-scoped so nothing leaks into the global object.
      const content = stripBoilerplate(bg.content);

      console.log('main ', bg.url);

      // A failure on one article is logged and must not abort the remaining ones.
      try {
        // Both tables are updated in one transaction so they cannot diverge.
        await DB.blog.tx(async (t) => {
          await t.any(
            `
            UPDATE article_content
            SET
            content = $1
            WHERE article_id = $2
            `,
            [content, bg.id]
          );

          await t.any(
            `
            UPDATE article_html_content
            SET
            html_content = $1
            WHERE article_id = $2
            `,
            [content, bg.id]
          );
        });
        console.log("Successfully updated content");
      } catch (e) {
        console.log(e);
      }
    }
  } catch (err) {
    console.log("Error occurred: ", err);
  }
})();
5 changes: 5 additions & 0 deletions services/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ exports.getDocumentMeta = async (content) => {
return [maxLanguage, maxConfidenceEntity, data];
} catch (error) {
console.error("Error:", error);
console.error("Input: ", body);
return [null, null, null];
}
};
Expand Down Expand Up @@ -187,6 +188,7 @@ exports.getDate = async (_kwarq) => {
return date;
} catch (error) {
console.error("Error:", error);
console.error("Input: ", body);
return null;
}
};
Expand Down Expand Up @@ -220,6 +222,8 @@ exports.embedDocument = async (id) => {

return console.log("Embedding successfully added");
} catch (error) {
console.error("Error: ", error);
console.error("Input: ", body);
throw Error(error);
}
};
Expand Down Expand Up @@ -254,6 +258,7 @@ exports.getIso3 = async (url) => {
return iso3;
} catch (error) {
console.error("Error:", error);
console.error("Input: ", body);
return null;
}
};

0 comments on commit 05ee305

Please sign in to comment.