Generate a llms-full version of the docs (#3285)

SamyPesse · web-flow · commit 33726c88a04f · 2025-06-08T12:13:14.000+02:00
diff --git a/.changeset/orange-hounds-sparkle.md b/.changeset/orange-hounds-sparkle.md
@@ -0,0 +1,5 @@
+---
+"gitbook-v2": patch
+---
+
+Generate a llms-full.txt version of the docs site
diff --git a/bun.lock b/bun.lock
diff --git a/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts b/packages/gitbook-v2/src/app/sites/static/[mode]/[siteURL]/[siteData]/llms-full.txt/route.ts
@@ -0,0 +1,14 @@
+import type { NextRequest } from 'next/server';
+
+import { serveLLMsFullTxt } from '@/routes/llms-full';
+import { type RouteLayoutParams, getStaticSiteContext } from '@v2/app/utils';
+
+export const dynamic = 'force-static';
+
+/**
+ * Handle GET requests for the `llms-full.txt` route.
+ * Resolves the site context from the route params and delegates the response
+ * to `serveLLMsFullTxt`.
+ */
+export async function GET(
+    _request: NextRequest,
+    { params }: { params: Promise<RouteLayoutParams> }
+) {
+    const routeParams = await params;
+    const resolved = await getStaticSiteContext(routeParams);
+    return serveLLMsFullTxt(resolved.context);
+}
diff --git a/packages/gitbook/e2e/internal.spec.ts b/packages/gitbook/e2e/internal.spec.ts
@@ -431,6 +431,38 @@ const testCases: TestsCase[] = [
             },
         ],
     },
+    {
+        name: 'llms.txt',
+        skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel',
+        contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/',
+        tests: [
+            {
+                name: 'llms.txt',
+                url: 'llms.txt',
+                screenshot: false,
+                run: async (_page, response) => {
+                    expect(response?.status()).toBe(200);
+                    expect(response?.headers()['content-type']).toContain('text/markdown');
+                },
+            },
+        ],
+    },
+    {
+        name: 'llms-full.txt',
+        skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel',
+        contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/',
+        tests: [
+            {
+                name: 'llms-full.txt',
+                url: 'llms-full.txt',
+                screenshot: false,
+                run: async (_page, response) => {
+                    expect(response?.status()).toBe(200);
+                    expect(response?.headers()['content-type']).toContain('text/markdown');
+                },
+            },
+        ],
+    },
     {
         name: 'Site subdirectory (proxy)',
         skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel',
diff --git a/packages/gitbook/package.json b/packages/gitbook/package.json
@@ -46,13 +46,20 @@
         "katex": "^0.16.9",
         "mathjax": "^3.2.2",
         "mdast-util-to-markdown": "^2.1.2",
+        "mdast-util-from-markdown": "^2.0.2",
+        "mdast-util-frontmatter": "^2.0.1",
+        "mdast-util-gfm": "^3.1.0",
+        "micromark-extension-gfm": "^3.0.0",
+        "micromark-extension-frontmatter": "^2.0.0",
+        "unist-util-remove": "^4.0.0",
+        "unist-util-visit": "^5.0.0",
         "memoizee": "^0.4.17",
         "next": "14.2.26",
         "next-themes": "^0.2.1",
         "nuqs": "^2.2.3",
         "object-hash": "^3.0.0",
         "openapi-types": "^12.1.3",
-        "p-map": "^7.0.0",
+        "p-map": "^7.0.3",
         "parse-cache-control": "^1.0.1",
         "partial-json": "^0.1.7",
         "react": "^19.0.0",
diff --git a/packages/gitbook/src/lib/urls.ts b/packages/gitbook/src/lib/urls.ts
@@ -8,3 +8,17 @@ export function checkIsHttpURL(input: string | URL): boolean {
     const parsed = new URL(input);
     return parsed.protocol === 'http:' || parsed.protocol === 'https:';
 }
+
+/**
+ * True when `input` parses as an absolute URL on its own, i.e. it carries a scheme
+ * (such as `https:` or `mailto:`).
+ *
+ * Relative paths and hash-only anchors cannot be parsed without a base URL, so they
+ * return false; use `checkIsAnchor` to detect anchors.
+ */
+export function checkIsExternalURL(input: string): boolean {
+    return URL.canParse(input);
+}
+
+/**
+ * Whether `input` is a hash-only anchor, i.e. its first character is `#`.
+ */
+export function checkIsAnchor(input: string): boolean {
+    return input.charAt(0) === '#';
+}
diff --git a/packages/gitbook/src/routes/llms-full.ts b/packages/gitbook/src/routes/llms-full.ts
@@ -0,0 +1,209 @@
+import path from 'node:path';
+import { joinPath } from '@/lib/paths';
+import { getIndexablePages } from '@/lib/sitemap';
+import { getSiteStructureSections } from '@/lib/sites';
+import { checkIsAnchor, checkIsExternalURL } from '@/lib/urls';
+import type { RevisionPageDocument, SiteSection, SiteSpace } from '@gitbook/api';
+import { type GitBookSiteContext, checkIsRootSiteContext } from '@v2/lib/context';
+import { throwIfDataError } from '@v2/lib/data';
+import assertNever from 'assert-never';
+import type { Link, Paragraph, Root } from 'mdast';
+import { fromMarkdown } from 'mdast-util-from-markdown';
+import { frontmatterFromMarkdown } from 'mdast-util-frontmatter';
+import { gfmFromMarkdown, gfmToMarkdown } from 'mdast-util-gfm';
+import { toMarkdown } from 'mdast-util-to-markdown';
+import { frontmatter } from 'micromark-extension-frontmatter';
+import { gfm } from 'micromark-extension-gfm';
+import { pMapIterable } from 'p-map';
+import { remove } from 'unist-util-remove';
+import { visit } from 'unist-util-visit';
+
+// We limit the concurrency to 100 to avoid hitting limits on concurrent requests
+// or on open file descriptors.
+const MAX_CONCURRENCY = 100;
+
+/**
+ * Generate a llms-full.txt file for the site.
+ * As the result can be large, we stream it as we generate it.
+ *
+ * @param context - Site context to generate the file from.
+ * @returns A streamed `text/markdown` response, or a 404 response when `context`
+ * is not the root site context.
+ */
+export async function serveLLMsFullTxt(context: GitBookSiteContext) {
+    if (!checkIsRootSiteContext(context)) {
+        return new Response('llms-full.txt is only served from the root of the site', {
+            status: 404,
+        });
+    }
+
+    return new Response(
+        new ReadableStream<Uint8Array>({
+            async pull(controller) {
+                // The whole document is enqueued during this single pull; the stream
+                // is closed right after so no further chunks are expected.
+                await streamMarkdownFromSiteStructure(context, controller);
+                controller.close();
+            },
+        }),
+        {
+            headers: {
+                'Content-Type': 'text/markdown; charset=utf-8',
+            },
+        }
+    );
+}
+
+/**
+ * Dispatch on the type of the site structure and stream the markdown of its
+ * site spaces into `stream`.
+ */
+async function streamMarkdownFromSiteStructure(
+    context: GitBookSiteContext,
+    stream: ReadableStreamDefaultController<Uint8Array>
+): Promise<void> {
+    const { structure } = context;
+
+    if (structure.type === 'sections') {
+        const sections = getSiteStructureSections(structure, { ignoreGroups: true });
+        return streamMarkdownFromSections(context, stream, sections);
+    }
+
+    if (structure.type === 'siteSpaces') {
+        // Without sections there is no section path to prefix, hence the empty base path.
+        return streamMarkdownFromSiteSpaces(context, stream, structure.structure, '');
+    }
+
+    // Exhaustiveness guard: reached only for an unknown structure type.
+    assertNever(structure);
+}
+
+/**
+ * Stream the markdown of each site section, one section after the other,
+ * using the section path as the base path of its site spaces.
+ */
+async function streamMarkdownFromSections(
+    context: GitBookSiteContext,
+    stream: ReadableStreamDefaultController<Uint8Array>,
+    siteSections: SiteSection[]
+): Promise<void> {
+    for (const { siteSpaces, path: sectionPath } of siteSections) {
+        await streamMarkdownFromSiteSpaces(context, stream, siteSpaces, sectionPath);
+    }
+}
+
+/**
+ * Stream markdown from site spaces.
+ *
+ * Site spaces are processed one after the other; spaces without a published URL
+ * are skipped. Within a space, the markdown of the indexable pages is fetched with
+ * at most `MAX_CONCURRENCY` pages in flight and enqueued into `stream`.
+ *
+ * @param context - Site context providing the data fetcher.
+ * @param stream - Controller of the response stream receiving the encoded chunks.
+ * @param siteSpaces - Site spaces to export.
+ * @param basePath - Path prefix joined with each site space path to resolve links.
+ */
+async function streamMarkdownFromSiteSpaces(
+    context: GitBookSiteContext,
+    stream: ReadableStreamDefaultController<Uint8Array>,
+    siteSpaces: SiteSpace[],
+    basePath: string
+): Promise<void> {
+    const { dataFetcher } = context;
+    // A single encoder is reused for every chunk instead of allocating one per page.
+    const encoder = new TextEncoder();
+
+    for (const siteSpace of siteSpaces) {
+        if (!siteSpace.urls.published) {
+            continue;
+        }
+        const rootPages = await throwIfDataError(
+            dataFetcher.getRevisionPages({
+                spaceId: siteSpace.space.id,
+                revisionId: siteSpace.space.revision,
+                metadata: false,
+            })
+        );
+        const pages = getIndexablePages(rootPages);
+        // Identical for every page of the space, so it is computed once per space.
+        const siteSpaceBasePath = joinPath(basePath, siteSpace.path);
+
+        for await (const markdown of pMapIterable(
+            pages,
+            async ({ page }) => {
+                if (page.type !== 'document') {
+                    return '';
+                }
+
+                return getMarkdownForPage(context, siteSpace, page, siteSpaceBasePath);
+            },
+            {
+                concurrency: MAX_CONCURRENCY,
+            }
+        )) {
+            // Non-document pages map to an empty string: skip them rather than
+            // enqueueing zero-length chunks.
+            if (markdown) {
+                stream.enqueue(encoder.encode(markdown));
+            }
+        }
+    }
+}
+
+/**
+ * Build the markdown of a single page: fetch it, strip the YAML frontmatter,
+ * insert the page description (when there is one) and rewrite relative links.
+ * The returned string ends with two extra newlines.
+ */
+async function getMarkdownForPage(
+    context: GitBookSiteContext,
+    siteSpace: SiteSpace,
+    page: RevisionPageDocument,
+    basePath: string
+): Promise<string> {
+    const source = await throwIfDataError(
+        context.dataFetcher.getRevisionPageMarkdown({
+            spaceId: siteSpace.space.id,
+            revisionId: siteSpace.space.revision,
+            pageId: page.id,
+        })
+    );
+
+    const ast = fromMarkdown(source, {
+        extensions: [frontmatter(['yaml']), gfm()],
+        mdastExtensions: [frontmatterFromMarkdown(['yaml']), gfmFromMarkdown()],
+    });
+
+    // Drop the frontmatter node from the tree
+    remove(ast, 'yaml');
+
+    const { description } = page;
+    if (description) {
+        // The page markdown is expected to start with the title as a H1 (index 0),
+        // so the description paragraph goes at index 1, right after it.
+        const descriptionParagraph: Paragraph = {
+            type: 'paragraph',
+            children: [{ type: 'text', value: description }],
+        };
+        ast.children.splice(1, 0, descriptionParagraph);
+    }
+
+    // Relative links are rewritten in place by transformLinks
+    transformLinks(context, ast, { currentPagePath: page.path, basePath });
+
+    const serialized = toMarkdown(ast, { extensions: [gfmToMarkdown()] });
+    return `${serialized}\n\n`;
+}
+
+/**
+ * Rewrite, in place, the URL of every relative link node in `tree`: the URL is
+ * resolved against `basePath` and the directory of `currentPagePath`, then passed
+ * through `linker.toPathInSite`.
+ * URLs that parse as absolute, anchors and paths starting with "/" are left untouched.
+ * Returns the same `tree`.
+ */
+export function transformLinks(
+    context: GitBookSiteContext,
+    tree: Root,
+    options: { currentPagePath: string; basePath: string }
+): Root {
+    const linker = context.linker;
+    const pageDir = path.posix.dirname(options.currentPagePath);
+
+    visit(tree, 'link', (link: Link) => {
+        const href = link.url;
+
+        // Only plain relative paths are rewritten
+        const isRelative =
+            !checkIsExternalURL(href) && !checkIsAnchor(href) && !href.startsWith('/');
+        if (!isRelative) {
+            return;
+        }
+
+        // Resolve against the current page’s directory and strip any leading “/”
+        const joined = path.posix.join(options.basePath, pageDir, href);
+        const pathInSite = path.posix.normalize(joined).replace(/^\/+/, '');
+
+        link.url = linker.toPathInSite(pathInSite);
+    });
+
+    return tree;
+}
diff --git a/packages/gitbook/src/routes/llms.ts b/packages/gitbook/src/routes/llms.ts
@@ -46,7 +46,7 @@ export async function serveLLMsTxt(
         }),
         {
             headers: {
-                'Content-Type': 'text/plain; charset=utf-8',
+                'Content-Type': 'text/markdown; charset=utf-8',
             },
         }
     );

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"gitbook-v2": patch
 +---
++
 +Generate a llms-full.txt version of the docs site
Original file line number	Diff line number	Diff line change
`@@ -46,7 +46,7 @@ export async function serveLLMsTxt(`
`46`	`46`	`}),`
`47`	`47`	`{`
`48`	`48`	`headers: {`
`49`		`- 'Content-Type': 'text/plain; charset=utf-8',`
	`49`	`+ 'Content-Type': 'text/markdown; charset=utf-8',`
`50`	`50`	`},`
`51`	`51`	`}`
`52`	`52`	`);`