diff --git a/controllers/partial/extractData.js b/controllers/partial/extractData.js
index 5573bca..7633144 100644
--- a/controllers/partial/extractData.js
+++ b/controllers/partial/extractData.js
@@ -240,6 +240,10 @@ const extractDataFromUrl = async (driver, url, ignoreRelevanceCheck = false) =>
       let content = ["blog", "publications"].includes(data.article_type) || $("body article").length > 0 ? $("body article").text().trim() : $("body").text().trim();
+
+      content = content.replace(/Skip to main content/g, ' ');
+      content = content.replace(/<iframe[\s\S]*?<\/iframe>/g, '');
+
       data.html_content = content;
       data.content = content;
     }
@@ -249,9 +253,13 @@ const extractDataFromUrl = async (driver, url, ignoreRelevanceCheck = false) =>
       return null;
     }
 
+    if (!(data?.content?.length > 0)) {
+      console.log('No content could be extracted. Url:', url);
+    }
+
     // Extract document meta
-    const [lang, location, meta] = await getDocumentMeta(data.content);
-    const iso3_b = await getIso3(url);
+    const [lang, location, meta] = data?.content ? await getDocumentMeta(data.content) : [null, null, null];
+    let iso3_b = url ? await getIso3(url) : null;
     if(!iso3_b && data.url.includes('/acceleratorlabs/')){
       iso3_b = 'NUL' //Url matches Global network page.
     }
diff --git a/scripts/js/clean_content.js b/scripts/js/clean_content.js
new file mode 100644
index 0000000..74083d4
--- /dev/null
+++ b/scripts/js/clean_content.js
@@ -0,0 +1,65 @@
+// One-off cleanup: strip "Skip to main content" boilerplate and iframe markup
+// from article content already stored in the database.
+require("dotenv").config();
+const { DB } = require("../../db");
+
+(async () => {
+  try {
+    const blogs = await DB.blog.any(
+      `
+      SELECT a.id, a.url, b.content
+      FROM articles a
+      JOIN article_content b ON b.article_id = a.id
+      JOIN article_html_content c ON c.article_id = a.id
+      JOIN raw_html d ON d.article_id = a.id
+      WHERE a.relevance > 1
+      AND (
+        b.content LIKE '%Skip to main content%'
+        OR b.content LIKE '%<iframe%'
+      )
+      ORDER BY a.id DESC
+      LIMIT 2
+      ;
+      `
+    );
+
+    for (const bg of blogs) {
+      // Remove "Skip to main content"
+      let content = bg.content.replace(/Skip to main content/g, '');
+
+      // Remove the specific iframe
+      content = content.replace(/