Skip to content

Commit c36166e

Browse files
authored
fix: extract only body from iframe elements (#2986)
After `iframe` contents are extracted and inlined, `parseWithCheerio` ends up with invalid HTML (e.g. `meta` or `title` elements inside the parent document's `body` element). The proposed changes replace each `iframe` element with a `div` containing only the inner HTML of the `iframe`'s `body`. Closes #2979
1 parent b25497b commit c36166e

File tree

5 files changed

+33
-5
lines changed

5 files changed

+33
-5
lines changed

packages/playwright-crawler/src/internals/utils/playwright-utils.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,7 +623,15 @@ export async function parseWithCheerio(
623623
const iframe = await frame.contentFrame();
624624

625625
if (iframe) {
626-
const contents = await iframe.content();
626+
/**
 * Resolves the HTML used to replace the iframe element in the parent document.
 *
 * Prefers the inner HTML of the iframe's `body`, so that document-level elements
 * of the framed page (e.g. `meta`, `title`) are not inlined into the parent `body`.
 * If the body cannot be read, falls back to the full serialized frame content.
 *
 * @returns The iframe body's inner HTML, or the whole frame HTML as a fallback.
 */
const getIframeHTML = async (): Promise<string> => {
    try {
        // `await` is required here: returning the bare promise would let a rejection
        // settle outside of this `try` block, so the fallback below would never run.
        return await iframe.locator('body').first().innerHTML();
    } catch {
        // Best-effort fallback to the previous behavior (full frame HTML).
        return iframe.content();
    }
};
633+
634+
const contents = await getIframeHTML();
627635

628636
await frame.evaluate((f, c) => {
629637
const replacementNode = document.createElement('div');

packages/puppeteer-crawler/src/internals/utils/puppeteer_utils.ts

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,15 @@ export async function parseWithCheerio(
204204
try {
205205
const iframe = await frame.contentFrame();
206206
if (iframe) {
207-
const contents = await iframe.content();
207+
/**
 * Resolves the HTML used to replace the iframe element in the parent document.
 *
 * Prefers the inner HTML of the iframe's `body`, so that document-level elements
 * of the framed page (e.g. `meta`, `title`) are not inlined into the parent `body`.
 * If the body cannot be read, falls back to the full serialized frame content.
 *
 * @returns The iframe body's inner HTML, or the whole frame HTML as a fallback.
 */
const getIframeHTML = async (): Promise<string> => {
    try {
        // `await` is required here: returning the bare promise would let a rejection
        // settle outside of this `try` block, so the fallback below would never run.
        return await iframe.$eval('body', (el) => el.innerHTML);
    } catch {
        // Best-effort fallback to the previous behavior (full frame HTML).
        return iframe.content();
    }
};
214+
215+
const contents = await getIframeHTML();
208216

209217
await frame.evaluate((f, c) => {
210218
const replacementNode = document.createElement('div');

test/core/playwright_utils.test.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,14 @@ describe('playwrightUtils', () => {
170170
const $ = await playwrightUtils.parseWithCheerio(page);
171171

172172
const headings = $('h1')
173-
.map((i, el) => $(el).text())
173+
.map((_, el) => $(el).text())
174174
.get();
175+
176+
const titles = $('title')
177+
.map((_, el) => $(el).text())
178+
.get();
179+
180+
expect(titles).toEqual(['Outside iframe title']);
175181
expect(headings).toEqual(['Outside iframe', 'In iframe']);
176182
} finally {
177183
await browser.close();

test/core/puppeteer_utils.test.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,8 +171,14 @@ describe('puppeteerUtils', () => {
171171
const $ = await puppeteerUtils.parseWithCheerio(page);
172172

173173
const headings = $('h1')
174-
.map((i, el) => $(el).text())
174+
.map((_, el) => $(el).text())
175175
.get();
176+
177+
const titles = $('title')
178+
.map((_, el) => $(el).text())
179+
.get();
180+
181+
expect(titles).toEqual(['Outside iframe title']);
176182
expect(headings).toEqual(['Outside iframe', 'In iframe']);
177183
} finally {
178184
await browser.close();

test/shared/_helper.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ console.log('Hello world!');
176176
<!DOCTYPE html>
177177
<html>
178178
<head>
179-
<title>Outside iframe</title>
179+
<title>Outside iframe title</title>
180180
</head>
181181
<body>
182182
<h1>Outside iframe</h1>

0 commit comments

Comments
 (0)