Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add PDF screenshot generation and display #995

Merged
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
9734ea1
Updated pdf2json to 3.1.5
AhmadMuj Feb 5, 2025
01173c4
Extract and store a screenshot from PDF files using pdf2pic
AhmadMuj Feb 8, 2025
c9b3892
Installing graphicsmagick and ghostscript
AhmadMuj Feb 8, 2025
403f873
Generate Missing PDF screenshot with tidyAssets worker for backward s…
AhmadMuj Feb 8, 2025
b370b57
Display PDF screenshot instead of the PDF in web if it exists.
AhmadMuj Feb 8, 2025
4dae477
Display PDF screenshot in mobile app if exists.
AhmadMuj Feb 8, 2025
b3c857a
Updated pnpm-lock.yaml
AhmadMuj Feb 8, 2025
5caa9a1
Removed console.log
AhmadMuj Feb 9, 2025
6d59073
Revert the unnecessary changes in package.json
AhmadMuj Feb 9, 2025
050778e
Revert pnpm-lock changes
AhmadMuj Feb 9, 2025
57c3072
Prevent rendering PDF files if the screenshot is not generated
AhmadMuj Feb 9, 2025
235de11
refactor: replace useEffect with useMemo for section initialization
AhmadMuj Feb 15, 2025
f2aef9d
feat: show PDF file download button and handle large PDFs by defaulti…
AhmadMuj Feb 15, 2025
bdb4863
feat: add file size to openapi spec
AhmadMuj Feb 15, 2025
0884764
feature: Add Assets preprocessing in fix mode to admin actions
AhmadMuj Feb 15, 2025
fbfc823
i18n: add reprocess_assets_fix_mode translation
AhmadMuj Feb 15, 2025
b6bd4f7
Merge branch 'main' into feat/pdf-thumbnail-preprocessing
AhmadMuj Feb 15, 2025
744ef20
i18n: Add missing ar translations
AhmadMuj Feb 15, 2025
34888d5
A bunch of fixes
MohamedBassem Feb 16, 2025
5eade27
Fix openspec schema
MohamedBassem Feb 16, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion apps/mobile/components/bookmarks/BookmarkCard.tsx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import React from "react";
import {
ActivityIndicator,
Alert,
Expand Down Expand Up @@ -300,11 +301,21 @@ function AssetCard({
}
const title = bookmark.title ?? bookmark.content.fileName;

let assetImage = bookmark.content.assetId;

const screenshot = bookmark.assets.find(
(item) => item.assetType === "screenshot",
);

if (screenshot) {
assetImage = screenshot.id;
}

return (
<View className="flex gap-2">
<Pressable onPress={onOpenBookmark}>
<BookmarkAssetImage
assetId={bookmark.content.assetId}
assetId={assetImage}
className="h-56 min-h-56 w-full object-cover"
/>
</Pressable>
Expand Down
46 changes: 39 additions & 7 deletions apps/web/components/dashboard/bookmarks/AssetCard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ import Link from "next/link";

import type { ZBookmarkTypeAsset } from "@hoarder/shared/types/bookmarks";
import { getAssetUrl } from "@hoarder/shared-react/utils/assetUtils";
import { getSourceUrl } from "@hoarder/shared-react/utils/bookmarkUtils";
import {
getSourceUrl,
isBookmarkStillTagging,
} from "@hoarder/shared-react/utils/bookmarkUtils";

import { BookmarkLayoutAdaptingCard } from "./BookmarkLayoutAdaptingCard";
import FooterLinkURL from "./FooterLinkURL";
Expand All @@ -32,13 +35,42 @@ function AssetImage({
);
}
case "pdf": {
return (
<iframe
title={bookmarkedAsset.assetId}
className={className}
src={getAssetUrl(bookmarkedAsset.assetId)}
/>
const screenshot = bookmark.assets.find(
(item) => item.assetType === "screenshot",
);
if (screenshot) {
return (
<Link href={`/dashboard/preview/${bookmark.id}`}>
<Image
alt="asset"
src={getAssetUrl(screenshot.id)}
fill={true}
className={className}
/>
</Link>
);
} else {
return (
<div>
<div className="mb-2 text-red-400">
{!isBookmarkStillTagging(bookmark) && (
<p className="m-2">
Please contact your administrator to perform a compact assets
action to generate PDF screenshots by visiting{" "}
<Link href="/admin/actions" className="underline">
admin/actions
</Link>
</p>
)}
</div>
<iframe
title={bookmarkedAsset.assetId}
className={className}
src={getAssetUrl(bookmarkedAsset.assetId)}
/>
</div>
);
}
}
default: {
const _exhaustiveCheck: never = bookmarkedAsset.assetType;
Expand Down
94 changes: 72 additions & 22 deletions apps/web/components/dashboard/preview/AssetContentSection.tsx
Original file line number Diff line number Diff line change
@@ -1,42 +1,92 @@
import { useEffect, useState } from "react";
import Image from "next/image";
import Link from "next/link";
import {
Select,
SelectContent,
SelectGroup,
SelectItem,
SelectTrigger,
SelectValue,
} from "@/components/ui/select";

import { BookmarkTypes, ZBookmark } from "@hoarder/shared/types/bookmarks";

export function AssetContentSection({ bookmark }: { bookmark: ZBookmark }) {
const [section, setSection] = useState<string>("");

useEffect(() => {
const screenshot = bookmark.assets.find(
(item) => item.assetType === "screenshot",
);
if (screenshot) {
setSection("screenshot");
} else {
setSection("pdf");
}
}, [bookmark]);

if (bookmark.content.type != BookmarkTypes.ASSET) {
throw new Error("Invalid content type");
}

switch (bookmark.content.assetType) {
case "image": {
return (
if (bookmark.content.assetType === "image") {
return (
<div className="relative h-full min-w-full">
<Link href={`/api/assets/${bookmark.content.assetId}`} target="_blank">
<Image
alt="asset"
fill={true}
className="object-contain"
src={`/api/assets/${bookmark.content.assetId}`}
/>
</Link>
</div>
);
}

if (bookmark.content.assetType === "pdf") {
const screenshot = bookmark.assets.find(
(item) => item.assetType === "screenshot",
);

const content =
section === "screenshot" && screenshot ? (
<div className="relative h-full min-w-full">
<Link
href={`/api/assets/${bookmark.content.assetId}`}
target="_blank"
>
<Image
alt="asset"
fill={true}
className="object-contain"
src={`/api/assets/${bookmark.content.assetId}`}
/>
</Link>
<Image
alt="screenshot"
src={`/api/assets/${screenshot.id}`}
fill={true}
className="object-contain"
/>
</div>
);
}
case "pdf": {
return (
) : (
<iframe
title={bookmark.content.assetId}
className="h-full w-full"
src={`/api/assets/${bookmark.content.assetId}`}
/>
);
}
default: {
return <div>Unsupported asset type</div>;
}

return (
<div className="flex h-full flex-col items-center gap-2">
<Select onValueChange={setSection} value={section}>
<SelectTrigger className="w-fit">
<SelectValue />
</SelectTrigger>
<SelectContent>
<SelectGroup>
<SelectItem value="screenshot" disabled={!screenshot}>
Screenshot
</SelectItem>
<SelectItem value="pdf">PDF</SelectItem>
</SelectGroup>
</SelectContent>
</Select>
{content}
</div>
);
}

return <div>Unsupported asset type</div>;
}
95 changes: 85 additions & 10 deletions apps/workers/assetPreprocessingWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@ import os from "os";
import { eq } from "drizzle-orm";
import { DequeuedJob, Runner } from "liteque";
import PDFParser from "pdf2json";
import { fromBuffer } from "pdf2pic";
import { createWorker } from "tesseract.js";

import type { AssetPreprocessingRequest } from "@hoarder/shared/queues";
import { db } from "@hoarder/db";
import { bookmarkAssets, bookmarks } from "@hoarder/db/schema";
import {
assets,
AssetTypes,
bookmarkAssets,
bookmarks,
} from "@hoarder/db/schema";
import { readAsset } from "@hoarder/shared/assetdb";
import serverConfig from "@hoarder/shared/config";
import logger from "@hoarder/shared/logger";
Expand All @@ -16,6 +22,8 @@ import {
triggerSearchReindex,
} from "@hoarder/shared/queues";

import { storeScreenshot } from "./crawlerWorker";

export class AssetPreprocessingWorker {
static build() {
logger.info("Starting asset preprocessing worker ...");
Expand Down Expand Up @@ -67,24 +75,75 @@ async function readImageText(buffer: Buffer) {

async function readPDFText(buffer: Buffer): Promise<{
text: string;
metadata: Record<string, string>;
metadata: Record<string, object>;
}> {
return new Promise((resolve, reject) => {
// The raw-text flag (second constructor argument) must be enabled so that getRawTextContent() returns the text; reference: https://github.com/modesty/pdf2json/issues/76#issuecomment-236569265
const pdfParser = new PDFParser(null, 1);
const pdfParser = new PDFParser(null, true);
pdfParser.on("pdfParser_dataError", reject);
pdfParser.on("pdfParser_dataReady", (pdfData) => {
resolve({
// The type isn't set correctly, reference : https://github.com/modesty/pdf2json/issues/327
// eslint-disable-next-line
text: (pdfParser as any).getRawTextContent(),
text: pdfParser.getRawTextContent(),
metadata: pdfData.Meta,
});
});
pdfParser.parseBuffer(buffer);
});
}

/**
 * Renders page 1 of a PDF to a PNG, stores the image through
 * `storeScreenshot`, and records it in the `assets` table for the bookmark.
 *
 * This is best-effort: every failure (rendering, storing, or the database
 * insert) is logged with the job id and reported as `false`. Nothing is
 * thrown to the caller.
 *
 * Rendering is delegated to pdf2pic. If it misbehaves, make sure ghostscript
 * and graphicsmagick are installed, see
 * https://github.com/yakovmeister/pdf2image/blob/HEAD/docs/gm-installation.md
 *
 * @param pdfBuffer - Raw bytes of the PDF to render.
 * @param userId - Owner of the bookmark; passed to `storeScreenshot` and
 *   written to the inserted asset row.
 * @param bookmarkId - Bookmark the screenshot row is attached to.
 * @param jobId - Identifier used as the prefix of every log line.
 * @returns `true` when the screenshot was stored and the row inserted,
 *   `false` otherwise.
 */
export async function extractAndSavePDFScreenshot(
  pdfBuffer: Buffer,
  userId: string,
  bookmarkId: string,
  jobId: string,
): Promise<boolean> {
  try {
    logger.info(
      `[${jobId}] Attempting to generate PDF screenshot for bookmarkId: ${bookmarkId}`,
    );

    // Convert only page 1 and ask for the result as an in-memory buffer.
    const screenshot = await fromBuffer(pdfBuffer, {
      density: 100,
      quality: 100,
      format: "png",
      preserveAspectRatio: true,
    })(1, { responseType: "buffer" });

    if (!screenshot.buffer) {
      logger.error(`[${jobId}] Failed to generate PDF screenshot`);
      return false;
    }

    // Store the screenshot
    const asset = await storeScreenshot(screenshot.buffer, userId, jobId);

    if (!asset) {
      logger.error(`[${jobId}] Failed to store PDF screenshot`);
      return false;
    }

    // Insert into database.
    // NOTE(review): the row reuses the LINK_SCREENSHOT asset type; presumably
    // this is what clients match as assetType "screenshot" — confirm.
    await db.insert(assets).values({
      id: asset.assetId,
      bookmarkId,
      userId,
      assetType: AssetTypes.LINK_SCREENSHOT,
      contentType: asset.contentType,
      size: asset.size,
      fileName: asset.fileName,
    });

    logger.info(`[${jobId}] Successfully saved PDF screenshot to database`);
    return true;
  } catch (error) {
    logger.error(`[${jobId}] Failed to process PDF screenshot: ${error}`);
    return false;
  }
}

async function preprocessImage(
jobId: string,
asset: Buffer,
Expand All @@ -110,7 +169,13 @@ async function preprocessImage(
async function preProcessPDF(
jobId: string,
asset: Buffer,
): Promise<{ content: string; metadata: string | null } | undefined> {
): Promise<
| {
content: string;
metadata: string | null;
}
| undefined
> {
const pdfParse = await readPDFText(asset);
if (!pdfParse?.text) {
throw new Error(
Expand Down Expand Up @@ -162,15 +227,25 @@ async function run(req: DequeuedJob<AssetPreprocessingRequest>) {
);
}

let result: { content: string; metadata: string | null } | undefined =
undefined;
let result:
| {
content: string;
metadata: string | null;
}
| undefined = undefined;

switch (bookmark.asset.assetType) {
case "image":
result = await preprocessImage(jobId, asset);
break;
case "pdf":
result = await preProcessPDF(jobId, asset);
await extractAndSavePDFScreenshot(
asset,
bookmark.userId,
bookmarkId,
jobId,
);
break;
default:
throw new Error(
Expand Down
2 changes: 1 addition & 1 deletion apps/workers/crawlerWorker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ function extractReadableContent(
return readableContent;
}

async function storeScreenshot(
export async function storeScreenshot(
screenshot: Buffer | undefined,
userId: string,
jobId: string,
Expand Down
Loading