Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,15 @@
"HPKE",
"lifecycles",
"llms",
"Llms"
"Llms",
"TTL",
Comment thread
nickytonline marked this conversation as resolved.
Outdated
"relogin",
"quantiles",
"pgxpool",
"otelpgx",
"mdxjs",
"exaring",
"fjhsb"
],
"ignorePaths": [
"*.mp4",
Expand Down
7 changes: 6 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-mailchimp-subscribe": "^2.1.3",
"redocusaurus": "^2.2.3"
"redocusaurus": "^2.5.0",
Comment thread
nickytonline marked this conversation as resolved.
Outdated
"remark": "^15.0.1",
"remark-mdx": "^3.1.0",
"remark-parse": "^11.0.0",
"remark-stringify": "^11.0.0",
"unist-util-visit": "^5.0.0"
},
"browserslist": {
"production": [
Expand Down
283 changes: 216 additions & 67 deletions plugins/llms-txt-plugin.js
Original file line number Diff line number Diff line change
@@ -1,54 +1,102 @@
const fs = require('fs');
const path = require('path');
const {remark} = require('remark');
const remarkParse = require('remark-parse').default;
const remarkStringify = require('remark-stringify').default;
const remarkMdx = require('remark-mdx').default;
const visit = require('unist-util-visit').visit;

/**
 * Pre-clean a Docusaurus MDX document before it is handed to remark.
 *
 * Steps, in order:
 * - Drops single-line `import ... from '...'` statements, except inside fenced
 *   code blocks (so code samples keep their imports).
 * - Removes `<Tabs>` / `<TabItem ...>` opening and closing tags; the text
 *   between the tags is kept.
 * - Removes `:::type ... :::` admonition blocks together with their contents.
 * - Removes any remaining standalone `:::` lines.
 * - Collapses runs of blank lines into a single blank line and drops leading
 *   blank lines. Single blank lines are kept because Markdown uses them to
 *   separate paragraphs.
 *
 * @param {string} raw - Raw MDX source.
 * @returns {string} The pre-cleaned source.
 */
function preCleanDocusaurusMDX(raw) {
  const importLine = /^\s*import\s.*from\s+['"][^'"]+['"];?\s*$/;
  const fenceLine = /^\s*(`{3,}|~{3,})(.*)$/;

  // Walk line by line so import statements inside ``` / ~~~ fences survive.
  let openFence = null;
  const keptLines = [];
  for (const line of raw.split('\n')) {
    const fence = fenceLine.exec(line);
    if (fence) {
      const [, marker, rest] = fence;
      if (openFence === null) {
        openFence = marker;
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        rest.trim() === ''
      ) {
        // A closing fence uses the same character, is at least as long as the
        // opening one, and carries no info string.
        openFence = null;
      }
      keptLines.push(line);
      continue;
    }
    if (openFence === null && importLine.test(line)) {
      continue;
    }
    keptLines.push(line);
  }
  let text = keptLines.join('\n');

  // Remove <Tabs>, </Tabs>, <TabItem ...>, </TabItem>
  text = text.replace(/<\/?Tabs[^>]*>/g, '');
  text = text.replace(/<\/?TabItem[^>]*>/g, '');

  // Remove admonition blocks, including everything between the markers.
  text = text.replace(
    /^:::(info|note|warning|caution|tip|danger|important|success|failure|admonition)[^\n]*\n([\s\S]*?)^:::\s*$/gm,
    '',
  );

  // Remove any "orphan" closing or opening :::
  text = text.replace(/^:::\s*$/gm, '');

  // Collapse two or more consecutive blank lines into one, then drop blank
  // lines at the very start of the document.
  text = text.replace(/\n(?:[^\S\n]*\n){2,}/g, '\n\n');
  text = text.replace(/^(?:[^\S\n]*\n)+/, '');

  return text;
}

/**
 * Clean Markdown/MDX for LLM ingestion using the remark AST.
 *
 * - Strips a leading YAML frontmatter block from the raw text before parsing.
 * - Drops top-level `yaml`, `mdxjsEsm` (import/export), `mdxJsxFlowElement`
 *   and `mdxJsxTextElement` nodes.
 * - Removes `image` nodes at any depth.
 * - Serializes with fenced code blocks, `-` bullets and `-` rules.
 * - Drops a thematic break left at the very start of the output and collapses
 *   runs of three or more newlines into two.
 *
 * @param {string} rawContent - Markdown/MDX source text.
 * @returns {Promise<string>} The cleaned Markdown.
 */
async function cleanMarkdownForLLM(rawContent) {
  // No frontmatter plugin is registered on the processor below, so the
  // frontmatter block is removed from the text up front. The pattern is
  // anchored to the start of the input (no `m` flag) so that `---` lines
  // further down (horizontal rules, YAML document separators inside code
  // fences) are never treated as frontmatter delimiters.
  const withoutFrontmatter = rawContent.replace(
    /^\uFEFF?---[ \t]*\r?\n(?:[\s\S]*?\r?\n)?---[ \t]*(?:\r?\n|$)/,
    '',
  );

  const processor = remark()
    .use(remarkParse)
    .use(remarkMdx)
    .use(() => (tree) => {
      // Only direct children of the root are filtered here.
      // NOTE(review): JSX nested inside paragraphs, lists, etc. is not
      // touched by this filter — confirm whether that is intended.
      tree.children = tree.children.filter(
        (node) =>
          node.type !== 'yaml' && // frontmatter, if a parser ever emits it
          node.type !== 'mdxjsEsm' && // import/export
          node.type !== 'mdxJsxFlowElement' &&
          node.type !== 'mdxJsxTextElement',
      );
      // Remove images. Returning the index tells the traversal to continue at
      // the node that moved into the removed slot, so the sibling that
      // follows a removed image is still visited.
      visit(tree, (node, index, parent) => {
        if (node.type === 'image' && parent && typeof index === 'number') {
          parent.children.splice(index, 1);
          return index;
        }
        return undefined;
      });
    })
    .use(remarkStringify, {
      fences: true,
      bullet: '-',
      rule: '-',
      listItemIndent: 'one',
    });

  const result = await processor.process(withoutFrontmatter);

  // Drop a thematic break sitting at the very start of the output. The
  // pattern is anchored to the start of the string so `---` lines elsewhere
  // (including inside code blocks) are left alone.
  const cleaned = result.value
    .replace(/^(?:-{3,}|\*{3,}|_{3,})[ \t]*\n+/, '')
    .trim();

  // Trim excessive blank lines
  return cleaned.replace(/\n{3,}/g, '\n\n');
}

console.log(
`✅ Generated llms.txt with ${finalRoutes.length} documentation pages`,
);
} catch (error) {
console.error('❌ Error generating llms.txt:', error);
throw error;
}
},
};
/**
 * Keep only the routes that represent documentation pages.
 *
 * A route is kept when its path starts with `/docs/` (but is not `/docs/`
 * itself), contains neither `/api/` nor `/_`, and its component is not
 * `@theme/NotFound/Content`.
 *
 * @param {Array<{path: string, component?: string}>} routes - Routes to filter.
 * @returns {Array} The routes that passed every check, in their original order.
 */
function filterDocumentationRoutes(routes) {
  const isDocumentationRoute = ({path: routePath, component}) => {
    if (!routePath.startsWith('/docs/') || routePath === '/docs/') {
      return false;
    }
    if (routePath.includes('/api/') || routePath.includes('/_')) {
      return false;
    }
    return component !== '@theme/NotFound/Content';
  };

  return routes.filter(isDocumentationRoute);
}

/**
Expand Down Expand Up @@ -230,9 +278,6 @@ function isCoreConcept(subsection) {
* Generate the llms.txt file content
*/
function generateLlmsTxtContent(routesByCategory, context) {
const {siteConfig} = context;
const baseUrl = siteConfig.url;

let content = `# Pomerium Documentation

This file contains information about Pomerium's public documentation to help LLMs understand and reference our documentation.
Expand All @@ -241,22 +286,36 @@ Pomerium is an identity and context-aware access proxy that provides secure acce

`;

// Generate sections for each category
Object.entries(routesByCategory).forEach(([categoryName, routes]) => {
content += `## ${categoryName}\n\n`;

// Sort routes for better organization
const sortedRoutes = sortRoutes(routes);

sortedRoutes.forEach((route) => {
const title = getRouteTitle(route);
const description = getRouteDescription(route);
const url = `${baseUrl}${route.path}`;

if (description) {
content += `- [${title}](${url}): ${description}\n`;
let mdFilePath = route.filePath || '';
// Make the path relative to the docs root for the link
let relPath = '';
if (mdFilePath) {
const docsRoot = path.join(context.siteDir, 'content', 'docs');
relPath = path.relative(docsRoot, mdFilePath).replace(/\\/g, '/');
if (relPath.startsWith('..')) {
relPath = path.basename(mdFilePath);
}
// Always output .md extension
relPath = relPath.replace(/\.(mdx|md)$/, '.md');
}
const mdUrl = relPath ? `content/docs/${relPath}` : '';

if (description && mdUrl) {
content += `- [${title}](${mdUrl}): ${description}\n`;
} else if (mdUrl) {
content += `- [${title}](${mdUrl})\n`;
} else if (description) {
content += `- ${title}: ${description}\n`;
} else {
content += `- [${title}](${url})\n`;
content += `- ${title}\n`;
}
});

Expand Down Expand Up @@ -329,27 +388,9 @@ function getRouteDescription(route) {
if (route.metadata?.description) {
return route.metadata.description;
}

// Could add logic to extract description from frontmatter
// or generate default descriptions based on category/path
return null;
}

/**
 * Keep only the routes that represent documentation pages.
 *
 * A route is kept when its path starts with `/docs/` (but is not `/docs/`
 * itself), contains neither `/api/` nor `/_`, and its component is not
 * `@theme/NotFound/Content`.
 *
 * @param {Array<{path: string, component?: string}>} routes - Routes to filter.
 * @returns {Array} The routes that passed every check, in their original order.
 */
function filterDocumentationRoutes(routes) {
  const isDocumentationRoute = ({path: routePath, component}) => {
    if (!routePath.startsWith('/docs/') || routePath === '/docs/') {
      return false;
    }
    if (routePath.includes('/api/') || routePath.includes('/_')) {
      return false;
    }
    return component !== '@theme/NotFound/Content';
  };

  return routes.filter(isDocumentationRoute);
}

/**
* Auto-detect categories by parsing sidebars and analyzing route structure
*/
Expand Down Expand Up @@ -546,4 +587,112 @@ function sortCategories(categories) {
return sorted;
}

/**
 * Decide whether a documentation source file is left out of the cleaned copy.
 *
 * A file is skipped when its path contains `_template.mdx` or `versions.mdx`,
 * or when the file itself or any folder between the docs root and the file
 * has a name starting with `_`. For files outside the docs root only the
 * file name is checked.
 *
 * @param {string} srcFile - Path of the source file.
 * @param {string} docsRoot - Root directory of the docs content.
 * @returns {boolean} True when the file must not be copied.
 */
function shouldSkipDocSource(srcFile, docsRoot) {
  const skippedNames = ['_template.mdx', 'versions.mdx'];
  if (skippedNames.some((name) => srcFile.includes(name))) {
    return true;
  }

  const relative = path.relative(docsRoot, srcFile);
  const segments = relative.startsWith('..')
    ? [path.basename(srcFile)]
    : relative.split(/[\\/]/);
  return segments.some((segment) => segment.startsWith('_'));
}

/**
 * Compute the destination path (relative to `<outDir>/content/docs`) for a
 * source file: its path relative to the docs root with forward slashes and a
 * `.md` extension, or just its file name when it lives outside the docs root.
 *
 * @param {string} srcFile - Path of the source file.
 * @param {string} docsRoot - Root directory of the docs content.
 * @returns {string} Relative destination path ending in `.md` for md/mdx input.
 */
function toMarkdownDestination(srcFile, docsRoot) {
  const relative = path.relative(docsRoot, srcFile).replace(/\\/g, '/');
  const base = relative.startsWith('..') ? path.basename(srcFile) : relative;
  return base.replace(/\.(mdx|md)$/, '.md');
}

/**
 * Main plugin.
 *
 * In `postBuild` it writes `<outDir>/llms.txt` and then writes a cleaned
 * `.md` copy of every route's source file under `<outDir>/content/docs`.
 *
 * @param {object} context - Plugin context; `context.siteDir` is read here.
 * @param {object} options - Plugin options (not read by this function).
 * @returns {Promise<{name: string, postBuild: Function}>} The plugin object.
 */
async function pluginLlmsTxt(context, options) {
  return {
    name: 'llms-txt-plugin',

    /**
     * @param {{outDir: string, routes: Array}} props - Build output directory
     *   and the routes handed to the hook.
     * @throws Rethrows any error raised while generating llms.txt; per-file
     *   copy failures are only logged as warnings and counted.
     */
    async postBuild({outDir, routes}) {
      try {
        // Use Docusaurus routes data instead of manual file scanning
        const docRoutes = filterDocumentationRoutes(routes);

        // If routes are not properly populated, fallback to file scanning
        let finalRoutes = docRoutes;
        if (docRoutes.length < 10) {
          console.log(
            '⚠️ Routes seem incomplete, falling back to file scanning...',
          );
          const contentDir = path.join(context.siteDir, 'content', 'docs');
          finalRoutes = await scanDocumentationFiles(contentDir);
        }

        // Auto-detect categories from sidebars and route structure
        const routesByCategory = await groupRoutesByCategoryAuto(
          finalRoutes,
          context,
        );

        // Generate the llms.txt content
        const llmsTxtContent = generateLlmsTxtContent(
          routesByCategory,
          context,
        );

        // Write the llms.txt file to the build output
        const llmsTxtPath = path.join(outDir, 'llms.txt');
        await fs.promises.writeFile(llmsTxtPath, llmsTxtContent, 'utf8');

        // Copy and CLEAN referenced markdown files to the build output (as .md)
        const docsRoot = path.join(context.siteDir, 'content', 'docs');
        let copied = 0;
        let errors = 0;
        for (const route of finalRoutes) {
          const srcFile = route.filePath;
          if (!srcFile) continue;

          // Templates, versions pages and "_"-prefixed files/folders are
          // skipped silently.
          if (shouldSkipDocSource(srcFile, docsRoot)) continue;

          const destFile = path.join(
            outDir,
            'content',
            'docs',
            toMarkdownDestination(srcFile, docsRoot),
          );

          try {
            // Directory creation is part of the per-file best-effort work, so
            // a failure here is reported like any other read/write failure.
            await fs.promises.mkdir(path.dirname(destFile), {recursive: true});

            const rawContent = await fs.promises.readFile(srcFile, 'utf8');
            const preCleanedContent = preCleanDocusaurusMDX(rawContent);
            let cleanedContent;
            try {
              cleanedContent = await cleanMarkdownForLLM(preCleanedContent);
            } catch (err) {
              // Fall back to the pre-cleaned text when full cleaning fails.
              console.warn(
                `Warning: Could not fully process ${srcFile}, writing pre-cleaned version instead:`,
                err.message,
              );
              cleanedContent = preCleanedContent;
              errors++;
            }
            await fs.promises.writeFile(destFile, cleanedContent, 'utf8');
            copied++;
          } catch (err) {
            console.warn(
              `Warning: Could not read or write ${srcFile}:`,
              err.message,
            );
            errors++;
          }
        }

        console.log(
          `✅ Generated llms.txt with ${finalRoutes.length} documentation pages. Copied and cleaned ${copied} markdown sources. ${errors ? `(${errors} errors)` : ''}`,
        );
      } catch (error) {
        console.error('❌ Error generating llms.txt:', error);
        throw error;
      }
    },
  };
}

module.exports = pluginLlmsTxt;
10 changes: 5 additions & 5 deletions static/js/syft.js
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
// Syft bootstrap snippet.
// Stores the given config on `window.syftc`, and — unless `window.syft`
// already exists — creates `window.syft` as an array whose `identify`,
// `track` and `page` methods push `[methodName, ...args]` onto that array,
// then injects the Syft script from cdn.syftdata.com as an async <script>.
!(function (config) {
  window.syftc = config;
  if (window.syft) return;

  window.syft = [];
  ['identify', 'track', 'page'].forEach(function (method) {
    window.syft[method] = function () {
      var call = [].slice.call(arguments);
      call.unshift(method);
      window.syft.push(call);
    };
  });

  var script = document.createElement('script');
  script.async = !0;
  script.setAttribute('src', 'https://cdn.syftdata.com/syftnext/syft.umd.js');
  // Append to <body> when it exists, otherwise to <head>.
  (document.body || document.head).appendChild(script);
})({sourceId: 'clrqv5nli0007jv09t58zir92'});
Loading