From 9308ec3b2f7538ac6d3dfd10b6fd126b4321aba9 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Tue, 19 Nov 2024 18:14:01 -0800 Subject: [PATCH] Fix for FB video replay (#212) - New fuzzy matching rules for FB, matching and rewriting embedded DASH (again) - New ruleset: range-as-query args configured per domain, allows lookup of range requests set via query args - APIs: add hasRangeAsQuery() and removeRangeAsQuery() to detect if URL has range embedded in the query args - Part of fix for [Bug]: web-recorder can not detect facebook video webrecorder/archiveweb.page#272 --- package.json | 2 +- src/collection.ts | 2 +- src/index.ts | 2 ++ src/response.ts | 33 ++++++++++---------- src/rewrite/dsruleset.ts | 65 +++++++++++++++++++++++++++++++++++++--- src/rewrite/index.ts | 26 +++++++++++++++- test/rewriteVideo.ts | 2 ++ yarn.lock | 14 +++++++++ 8 files changed, 123 insertions(+), 23 deletions(-) diff --git a/package.json b/package.json index 044a5f47..1a5afbe9 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,7 @@ "path-parser": "^6.1.0", "process": "^0.11.10", "stream-browserify": "^3.0.0", - "warcio": "^2.4.0" + "warcio": "^2.4.2" }, "devDependencies": { "@swc-node/register": "^1.10.9", diff --git a/src/collection.ts b/src/collection.ts index 7e71f846..789b2ffb 100644 --- a/src/collection.ts +++ b/src/collection.ts @@ -1,4 +1,4 @@ -import { Rewriter } from "./rewrite/index"; +import { Rewriter } from "./rewrite"; import { getTS, diff --git a/src/index.ts b/src/index.ts index b2871f46..7998a381 100644 --- a/src/index.ts +++ b/src/index.ts @@ -5,6 +5,8 @@ export { Rewriter, } from "./rewrite"; +export { removeRangeAsQuery, hasRangeAsQuery } from "./rewrite/dsruleset"; + export { ArchiveRequest } from "./request"; export { ArchiveResponse } from "./response"; diff --git a/src/response.ts b/src/response.ts index 5d6f9351..b8118a12 100644 --- a/src/response.ts +++ b/src/response.ts @@ -1,4 +1,4 @@ -import { BaseAsyncIterReader, AsyncIterReader, LimitReader } from "warcio"; +import { BaseAsyncIterReader, AsyncIterReader } from "warcio"; import { isNullBodyStatus, decodeLatin1, @@ -300,21 +300,8 @@ class ArchiveResponse { const start = Number(bytes[1]); const end = Number(bytes[2]) || length - 1; - if (this.buffer) { - this.buffer = this.buffer.slice(start, end + 1); - } else if (this.reader) { - // [TODO] - // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition - if (!(this.reader instanceof LimitReader) || !this.reader.setLimitSkip) { - return false; - } - if (start !== 0 || end !== length - 1) { - this.reader.setLimitSkip(end - start + 1, start); - } - //TODO - // } else if (this.reader.setRangeAll) { - // this.reader.setRangeAll(length); - // } + if (!this.setRawRange(start, end)) { + return false; } this.headers.set("Content-Range", `bytes ${start}-${end}/${length}`); @@ -326,6 +313,20 @@ class ArchiveResponse { return true; } + setRawRange(start: number, end: number) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const reader = this.reader as any; + if (this.buffer) { + this.buffer = this.buffer.slice(start, end + 1); + return true; + } else if (reader?.setLimitSkip) { + reader.setLimitSkip(end - start + 1, start); + return true; + } + + return false; + } + makeResponse(coHeaders = false, overwriteDisposition = false) { let body: Uint8Array | ReadableStream | null = null; if (!isNullBodyStatus(this.status)) { diff --git a/src/rewrite/dsruleset.ts b/src/rewrite/dsruleset.ts index 84958fd1..cebcc54f 100644 --- a/src/rewrite/dsruleset.ts +++ b/src/rewrite/dsruleset.ts @@ -1,3 +1,4 @@ +import { rewriteDASH } from "./rewriteVideo"; import { type RxRewriter, type Rule } from "./rxrewriter"; //import unescapeJs from "unescape-js"; @@ -49,10 +50,10 @@ export const DEFAULT_RULES: Rules[] = [ { contains: ["facebook.com/", "fbsbx.com/"], rxRules: [ - //[/"dash_prefetch_experimental.*"playlist".*?(?=["][,]["]dash)/, ruleRewriteFBDash], - [/"dash_/, ruleReplace('"__nodash__')], - [/_dash"/, ruleReplace('__nodash__"')], - [/_dash_/, ruleReplace("__nodash__")], + [/"dash_manifests.*?,"failure_reason":null}]/, ruleRewriteFBDash], + //[/"dash_/, ruleReplace('"__nodash__')], + //[/_dash"/, ruleReplace('__nodash__"')], + //[/_dash_/, ruleReplace("__nodash__")], [/"playlist/, ruleReplace('"__playlist__')], [ /"debugNoBatching\s?":(?:false|0)/, @@ -124,6 +125,62 @@ export const HTML_ONLY_RULES: Rules[] = [ ...DEFAULT_RULES, ]; +const RANGE_RULES = [ + { + contains: /video.*fbcdn.net/, + start: "bytestart", + end: "byteend", + }, +]; + +export function hasRangeAsQuery(url: string) { + if (!url) { + return null; + } + for (const rule of RANGE_RULES) { + const { contains, start, end } = rule; + if (url.match(contains)) { + return { start, end }; + } + } + + return null; +} + +export function removeRangeAsQuery(url: string) { + const result = hasRangeAsQuery(url); + if (!result) { + return null; + } + try { + const parsedUrl = new URL(url); + if ( + !parsedUrl.searchParams.has(result.start) || + !parsedUrl.searchParams.has(result.end) + ) { + return null; + } + parsedUrl.searchParams.delete(result.start); + parsedUrl.searchParams.delete(result.end); + return parsedUrl.href; + } catch (_e) { + return null; + } +} + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export function ruleRewriteFBDash(text: string, opts: Record) { + const start = text.indexOf("\\u003C?xml"); + const end = text.indexOf("\\u003C\\/MPD>", start) + "\\u003C\\/MPD>".length; + const rwtext: string = JSON.parse('"' + text.slice(start, end) + '"'); + + let rw = rewriteDASH(rwtext, opts); + + rw = JSON.stringify(rw).replaceAll("<", "\\u003C").slice(1, -1); + + return text.slice(0, start) + rw + text.slice(end); +} + // =========================================================================== function ruleReplace(str: string) { return (x: string) => str.replace("{0}", x); diff --git a/src/rewrite/index.ts b/src/rewrite/index.ts index dae4d213..b319edfd 100644 --- a/src/rewrite/index.ts +++ b/src/rewrite/index.ts @@ -6,7 +6,11 @@ import { decodeResponse } from "./decoder"; import { rewriteDASH, rewriteHLS } from "./rewriteVideo"; -import { DomainSpecificRuleSet, HTML_ONLY_RULES } from "./dsruleset"; +import { + DomainSpecificRuleSet, + hasRangeAsQuery, + HTML_ONLY_RULES, +} from "./dsruleset"; import { RxRewriter } from "./rxrewriter"; import { JSRewriter } from "./jsrewriter"; @@ -322,6 +326,26 @@ export class Rewriter { this.isCharsetUTF8 = true; } response.setText(text, this.isCharsetUTF8); + } else { + // check range-as-query + const result = hasRangeAsQuery(request.url); + if (result) { + const url = new URL(request.url); + const start = parseInt(url.searchParams.get(result.start) || ""); + const end = parseInt(url.searchParams.get(result.end) || ""); + if (!isNaN(start) && !isNaN(end)) { + const existingLen = Number(response.headers.get("Content-Length")); + const newLen = end - start + 1; + if ( + existingLen !== newLen && + (isNaN(existingLen) || existingLen > newLen) && + response.setRawRange(start, end) + ) { + console.log("setting range", start, end, newLen); + response.headers.set("Content-Length", String(newLen)); + } + } + } } return response; diff --git a/test/rewriteVideo.ts b/test/rewriteVideo.ts index 2e1d3b08..b3f847d3 100644 --- a/test/rewriteVideo.ts +++ b/test/rewriteVideo.ts @@ -208,6 +208,7 @@ const test4 = ytplayer.config.args.dash = "0"; ytplayer.config.args.dashmpd = "" t.is(result, expected, result); }); +/* test("FB rewrite JS", async (t) => { const content = `\