Skip to content

Commit 33726c8

Browse files
authored
Generate a llms-full version of the docs (#3285)
1 parent 6aa3ff9 commit 33726c8

File tree

8 files changed

+319
-7
lines changed

8 files changed

+319
-7
lines changed

.changeset/orange-hounds-sparkle.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"gitbook-v2": patch
3+
---
4+
5+
Generate a llms-full.txt version of the docs site

bun.lock

Lines changed: 36 additions & 5 deletions
Large diffs are not rendered by default.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
import type { NextRequest } from 'next/server';
2+
3+
import { serveLLMsFullTxt } from '@/routes/llms-full';
4+
import { type RouteLayoutParams, getStaticSiteContext } from '@v2/app/utils';
5+
6+
export const dynamic = 'force-static';
7+
8+
/**
 * Route handler for llms-full.txt: awaits the route params, resolves the static
 * site context from them and delegates the response to `serveLLMsFullTxt`.
 */
export async function GET(
    _request: NextRequest,
    { params }: { params: Promise<RouteLayoutParams> }
) {
    const routeParams = await params;
    const { context } = await getStaticSiteContext(routeParams);
    return serveLLMsFullTxt(context);
}

packages/gitbook/e2e/internal.spec.ts

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,38 @@ const testCases: TestsCase[] = [
431431
},
432432
],
433433
},
434+
{
435+
name: 'llms.txt',
436+
skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel',
437+
contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/',
438+
tests: [
439+
{
440+
name: 'llms.txt',
441+
url: 'llms.txt',
442+
screenshot: false,
443+
run: async (_page, response) => {
444+
expect(response?.status()).toBe(200);
445+
expect(response?.headers()['content-type']).toContain('text/markdown');
446+
},
447+
},
448+
],
449+
},
450+
{
451+
name: 'llms-full.txt',
452+
skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel',
453+
contentBaseURL: 'https://gitbook.gitbook.io/test-gitbook-open/',
454+
tests: [
455+
{
456+
name: 'llms-full.txt',
457+
url: 'llms-full.txt',
458+
screenshot: false,
459+
run: async (_page, response) => {
460+
expect(response?.status()).toBe(200);
461+
expect(response?.headers()['content-type']).toContain('text/markdown');
462+
},
463+
},
464+
],
465+
},
434466
{
435467
name: 'Site subdirectory (proxy)',
436468
skip: process.env.ARGOS_BUILD_NAME !== 'v2-vercel',

packages/gitbook/package.json

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,20 @@
4646
"katex": "^0.16.9",
4747
"mathjax": "^3.2.2",
4848
"mdast-util-to-markdown": "^2.1.2",
49+
"mdast-util-from-markdown": "^2.0.2",
50+
"mdast-util-frontmatter": "^2.0.1",
51+
"mdast-util-gfm": "^3.1.0",
52+
"micromark-extension-gfm": "^3.0.0",
53+
"micromark-extension-frontmatter": "^2.0.0",
54+
"unist-util-remove": "^4.0.0",
55+
"unist-util-visit": "^5.0.0",
4956
"memoizee": "^0.4.17",
5057
"next": "14.2.26",
5158
"next-themes": "^0.2.1",
5259
"nuqs": "^2.2.3",
5360
"object-hash": "^3.0.0",
5461
"openapi-types": "^12.1.3",
55-
"p-map": "^7.0.0",
62+
"p-map": "^7.0.3",
5663
"parse-cache-control": "^1.0.1",
5764
"partial-json": "^0.1.7",
5865
"react": "^19.0.0",

packages/gitbook/src/lib/urls.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,17 @@ export function checkIsHttpURL(input: string | URL): boolean {
88
const parsed = new URL(input);
99
return parsed.protocol === 'http:' || parsed.protocol === 'https:';
1010
}
11+
12+
/**
 * True for absolute URLs (`scheme:*`), e.g. `https://…` or `mailto:…`.
 *
 * Relative paths and hash-only anchors are NOT matched: the input is parsed
 * without a base URL, so anything lacking a scheme fails to parse.
 * Use `checkIsAnchor` to detect hash-only anchors.
 */
export function checkIsExternalURL(input: string): boolean {
    return URL.canParse(input);
}
18+
19+
/**
 * Whether `input` is a hash-only anchor, i.e. its first character is `#`.
 */
export function checkIsAnchor(input: string): boolean {
    return input.charAt(0) === '#';
}
Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
import path from 'node:path';
2+
import { joinPath } from '@/lib/paths';
3+
import { getIndexablePages } from '@/lib/sitemap';
4+
import { getSiteStructureSections } from '@/lib/sites';
5+
import { checkIsAnchor, checkIsExternalURL } from '@/lib/urls';
6+
import type { RevisionPageDocument, SiteSection, SiteSpace } from '@gitbook/api';
7+
import { type GitBookSiteContext, checkIsRootSiteContext } from '@v2/lib/context';
8+
import { throwIfDataError } from '@v2/lib/data';
9+
import assertNever from 'assert-never';
10+
import type { Link, Paragraph, Root } from 'mdast';
11+
import { fromMarkdown } from 'mdast-util-from-markdown';
12+
import { frontmatterFromMarkdown } from 'mdast-util-frontmatter';
13+
import { gfmFromMarkdown, gfmToMarkdown } from 'mdast-util-gfm';
14+
import { toMarkdown } from 'mdast-util-to-markdown';
15+
import { frontmatter } from 'micromark-extension-frontmatter';
16+
import { gfm } from 'micromark-extension-gfm';
17+
import { pMapIterable } from 'p-map';
18+
import { remove } from 'unist-util-remove';
19+
import { visit } from 'unist-util-visit';
20+
21+
// We limit the concurrency to 100 to avoid hitting concurrent-request limits
22+
// or file descriptor limits.
23+
const MAX_CONCURRENCY = 100;
24+
25+
/**
 * Generate an llms-full.txt file for the site.
 * As the result can be large, we stream it as we generate it.
 *
 * Responds with a 404 when the context is not the root of the site; otherwise
 * responds with a streamed `text/markdown` body.
 */
export async function serveLLMsFullTxt(context: GitBookSiteContext) {
    if (!checkIsRootSiteContext(context)) {
        return new Response('llms-full.txt is only served from the root of the site', {
            status: 404,
        });
    }

    return new Response(
        new ReadableStream<Uint8Array>({
            // The whole document is generated within a single pull; the stream is
            // closed as soon as generation completes.
            async pull(controller) {
                await streamMarkdownFromSiteStructure(context, controller);
                controller.close();
            },
        }),
        {
            headers: {
                'Content-Type': 'text/markdown; charset=utf-8',
            },
        }
    );
}
48+
49+
/**
 * Dispatch on the kind of site structure and stream the markdown of everything
 * it contains into `stream`.
 */
async function streamMarkdownFromSiteStructure(
    context: GitBookSiteContext,
    stream: ReadableStreamDefaultController<Uint8Array>
): Promise<void> {
    const { structure } = context;

    switch (structure.type) {
        case 'siteSpaces':
            // No sections: site spaces are streamed with an empty base path.
            return streamMarkdownFromSiteSpaces(context, stream, structure.structure, '');
        case 'sections': {
            const sections = getSiteStructureSections(structure, { ignoreGroups: true });
            return streamMarkdownFromSections(context, stream, sections);
        }
        default:
            assertNever(structure);
    }
}
69+
70+
/**
 * Stream the markdown of each site section, one section after the other,
 * using the section path as base path for its site spaces.
 */
async function streamMarkdownFromSections(
    context: GitBookSiteContext,
    stream: ReadableStreamDefaultController<Uint8Array>,
    siteSections: SiteSection[]
): Promise<void> {
    for (const { siteSpaces, path: sectionPath } of siteSections) {
        await streamMarkdownFromSiteSpaces(context, stream, siteSpaces, sectionPath);
    }
}
87+
88+
/**
 * Stream markdown from site spaces.
 *
 * Site spaces are processed one after the other; within a site space, pages are
 * converted with up to `MAX_CONCURRENCY` conversions in flight and written to
 * `stream` as they are yielded by `pMapIterable`.
 *
 * @param context - Site context providing the data fetcher.
 * @param stream - Controller of the response stream receiving UTF-8 encoded markdown.
 * @param siteSpaces - Site spaces to export; those without a published URL are skipped.
 * @param basePath - Path prefix joined with each site space path when rewriting links.
 */
async function streamMarkdownFromSiteSpaces(
    context: GitBookSiteContext,
    stream: ReadableStreamDefaultController<Uint8Array>,
    siteSpaces: SiteSpace[],
    basePath: string
): Promise<void> {
    const { dataFetcher } = context;
    // A single encoder is reused for every chunk instead of allocating one per page.
    const encoder = new TextEncoder();

    for (const siteSpace of siteSpaces) {
        const siteSpaceUrl = siteSpace.urls.published;
        if (!siteSpaceUrl) {
            continue;
        }

        const rootPages = await throwIfDataError(
            dataFetcher.getRevisionPages({
                spaceId: siteSpace.space.id,
                revisionId: siteSpace.space.revision,
                metadata: false,
            })
        );
        const pages = getIndexablePages(rootPages);

        // Identical for every page of this site space, so computed once.
        const siteSpaceBasePath = joinPath(basePath, siteSpace.path);

        for await (const markdown of pMapIterable(
            pages,
            async ({ page }) => {
                // Only document pages have markdown content.
                if (page.type !== 'document') {
                    return '';
                }

                return getMarkdownForPage(context, siteSpace, page, siteSpaceBasePath);
            },
            {
                concurrency: MAX_CONCURRENCY,
            }
        )) {
            // Non-document pages yield an empty string: nothing to write for them.
            if (markdown.length > 0) {
                stream.enqueue(encoder.encode(markdown));
            }
        }
    }
}
135+
136+
/**
 * Build the markdown of a single document page: fetch its markdown, strip the
 * YAML frontmatter, insert the page description (if any) and rewrite relative
 * links. The result is terminated by two extra newlines.
 */
async function getMarkdownForPage(
    context: GitBookSiteContext,
    siteSpace: SiteSpace,
    page: RevisionPageDocument,
    basePath: string
): Promise<string> {
    const rawMarkdown = await throwIfDataError(
        context.dataFetcher.getRevisionPageMarkdown({
            spaceId: siteSpace.space.id,
            revisionId: siteSpace.space.revision,
            pageId: page.id,
        })
    );

    const ast = fromMarkdown(rawMarkdown, {
        extensions: [frontmatter(['yaml']), gfm()],
        mdastExtensions: [frontmatterFromMarkdown(['yaml']), gfmFromMarkdown()],
    });

    // Drop the frontmatter node(s).
    remove(ast, 'yaml');

    const { description } = page;
    if (description) {
        // The first node is expected to be the page title (H1); the description
        // goes right after it, as its own paragraph.
        const paragraph: Paragraph = {
            type: 'paragraph',
            children: [{ type: 'text', value: description }],
        };
        ast.children.splice(1, 0, paragraph);
    }

    // Express relative links from the root of the site.
    transformLinks(context, ast, { currentPagePath: page.path, basePath });

    const serialized = toMarkdown(ast, { extensions: [gfmToMarkdown()] });
    return `${serialized}\n\n`;
}
179+
180+
/**
 * Rewrite, in place, the URL of every relative link node of `tree` so that it is
 * resolved from the site root. Absolute URLs, hash-only anchors and paths that
 * already start with `/` are left untouched.
 *
 * @returns The same `tree` instance, after mutation.
 */
export function transformLinks(
    context: GitBookSiteContext,
    tree: Root,
    options: { currentPagePath: string; basePath: string }
): Root {
    const { basePath, currentPagePath } = options;
    const { linker } = context;
    const pageDir = path.posix.dirname(currentPagePath);

    visit(tree, 'link', (link: Link) => {
        const href = link.url;

        // Only relative links are rewritten.
        const isRelative =
            !checkIsExternalURL(href) && !checkIsAnchor(href) && !href.startsWith('/');
        if (!isRelative) {
            return;
        }

        // Resolve against the directory of the current page, then drop any leading "/".
        const joined = path.posix.join(basePath, pageDir, href);
        const pathInSite = path.posix.normalize(joined).replace(/^\/+/, '');

        link.url = linker.toPathInSite(pathInSite);
    });

    return tree;
}

packages/gitbook/src/routes/llms.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ export async function serveLLMsTxt(
4646
}),
4747
{
4848
headers: {
49-
'Content-Type': 'text/plain; charset=utf-8',
49+
'Content-Type': 'text/markdown; charset=utf-8',
5050
},
5151
}
5252
);

0 commit comments

Comments
 (0)