diff --git a/cspell.json b/cspell.json index 452365e0c..08d1f7e05 100644 --- a/cspell.json +++ b/cspell.json @@ -231,7 +231,14 @@ "HPKE", "lifecycles", "llms", - "Llms" + "Llms", + "relogin", + "quantiles", + "pgxpool", + "otelpgx", + "mdxjs", + "exaring", + "fjhsb" ], "ignorePaths": [ "*.mp4", diff --git a/package.json b/package.json index 54ab2cdaf..2449be36f 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,12 @@ "react": "^18.2.0", "react-dom": "^18.2.0", "react-mailchimp-subscribe": "^2.1.3", - "redocusaurus": "^2.2.3" + "redocusaurus": "^2.2.3", + "remark": "^15.0.1", + "remark-mdx": "^3.1.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unist-util-visit": "^5.0.0" }, "browserslist": { "production": [ diff --git a/plugins/llms-txt-plugin.js b/plugins/llms-txt-plugin.js index e240e8576..7c7f72dca 100644 --- a/plugins/llms-txt-plugin.js +++ b/plugins/llms-txt-plugin.js @@ -1,54 +1,102 @@ const fs = require('fs'); const path = require('path'); +const {remark} = require('remark'); +const remarkParse = require('remark-parse').default; +const remarkStringify = require('remark-stringify').default; +const remarkMdx = require('remark-mdx').default; +const visit = require('unist-util-visit').visit; /** - * Docusaurus plugin to generate llms.txt file for LLM consumption - * Based on the actual documentation structure and metadata + * Pre-clean Docusaurus MDX files before remark parsing. + * - Removes all :::admonition blocks (keeps the content inside) + * - Removes ... and ... blocks (keeps content inside) + * - Strips any remaining standalone ::: lines */ -async function pluginLlmsTxt(context, options) { - return { - name: 'llms-txt-plugin', +function preCleanDocusaurusMDX(raw) { + // Remove all Docusaurus import lines + raw = raw.replace(/^\s*import\s.*from\s+['"][^'"]+['"];?\s*$/gm, ''); - async postBuild({outDir, routes, ...buildContext}) { - try { - // Use Docusaurus routes data instead of manual file scanning - const docRoutes = filterDocumentationRoutes(routes); + // Remove , , , + raw = raw.replace(/<\/?Tabs[^>]*>/g, ''); + raw = raw.replace(/<\/?TabItem[^>]*>/g, ''); - // If routes are not properly populated, fallback to file scanning - let finalRoutes = docRoutes; - if (docRoutes.length < 10) { - console.log( - '⚠️ Routes seem incomplete, falling back to file scanning...', - ); - const contentDir = path.join(context.siteDir, 'content', 'docs'); - finalRoutes = await scanDocumentationFiles(contentDir); + // Remove all Docusaurus admonition blocks (with all contents) + raw = raw.replace( + /^:::(info|note|caution|tip|danger|important|success|failure|admonition)[^\n]*\n([\s\S]*?)^:::\s*$/gm, + '', + ); + + // Remove any "orphan" closing or opening ::: + raw = raw.replace(/^:::\s*$/gm, ''); + + // Optionally strip extra blank lines + raw = raw.replace(/^\s*\n/gm, ''); + + return raw; +} + +/** + * Clean Markdown/MDX for LLM ingestion using remark AST + * - Strips YAML frontmatter + * - Strips import/export statements (MDX) + * - Strips MDX JSX elements and self-closing components + * - Removes images (optional: swap with alt text) + * - Leaves code blocks and inline code untouched + */ +async function cleanMarkdownForLLM(rawContent) { + const processor = remark() + .use(remarkParse) + .use(remarkMdx) + .use(() => (tree) => { + // Remove YAML frontmatter + tree.children = tree.children.filter( + (node) => + node.type !== 'yaml' && // Remove frontmatter + node.type !== 'mdxjsEsm' && // Remove import/export + node.type !== 'mdxJsxFlowElement' && + node.type !== 'mdxJsxTextElement', + ); + // Remove images + visit(tree, (node, index, parent) => { + if (node.type === 'image' && parent && typeof index === 'number') { + parent.children.splice(index, 1); } + }); + }) + .use(remarkStringify, { + fences: true, + bullet: '-', + rule: '-', + listItemIndent: 'one', + // Important! Do not stringify YAML nodes (just in case) + // This setting may not exist, but keeping for emphasis. + }); - // Auto-detect categories from sidebars and route structure - const routesByCategory = await groupRoutesByCategoryAuto( - finalRoutes, - context, - ); + const result = await processor.process(rawContent); - // Generate the llms.txt content - const llmsTxtContent = generateLlmsTxtContent( - routesByCategory, - context, - ); + // Remove any accidental lingering frontmatter at the start + let cleaned = result.value + .replace(/^---[\s\S]*?---+\s*/m, '') // Remove frontmatter with any number of dashes + .replace(/^(?:-{3,}|\*{3,}|_{3,})\s*\n+/gm, '') // Remove HRs at file start + .trim(); - // Write the llms.txt file to the build output - const llmsTxtPath = path.join(outDir, 'llms.txt'); - await fs.promises.writeFile(llmsTxtPath, llmsTxtContent, 'utf8'); + // Optionally trim excessive blank lines + return cleaned.replace(/\n{3,}/g, '\n\n'); +} - console.log( - `✅ Generated llms.txt with ${finalRoutes.length} documentation pages`, - ); - } catch (error) { - console.error('❌ Error generating llms.txt:', error); - throw error; - } - }, - }; +/** + * Filter routes to only include documentation routes + */ +function filterDocumentationRoutes(routes) { + return routes.filter((route) => { + return ( + route.path.startsWith('/docs/') && + route.path !== '/docs/' && + !route.path.includes('/api/') && + !route.path.includes('/_') && + route.component !== '@theme/NotFound/Content' + ); + }); } /** @@ -230,9 +278,6 @@ function isCoreConcept(subsection) { * Generate the llms.txt file content */ function generateLlmsTxtContent(routesByCategory, context) { - const {siteConfig} = context; - const baseUrl = siteConfig.url; - let content = `# Pomerium Documentation This file contains information about Pomerium's public documentation to help LLMs understand and reference our documentation. @@ -241,22 +286,36 @@ Pomerium is an identity and context-aware access proxy that provides secure acce `; - // Generate sections for each category Object.entries(routesByCategory).forEach(([categoryName, routes]) => { content += `## ${categoryName}\n\n`; - // Sort routes for better organization const sortedRoutes = sortRoutes(routes); sortedRoutes.forEach((route) => { const title = getRouteTitle(route); const description = getRouteDescription(route); - const url = `${baseUrl}${route.path}`; - - if (description) { - content += `- [${title}](${url}): ${description}\n`; + let mdFilePath = route.filePath || ''; + // Make the path relative to the docs root for the link + let relPath = ''; + if (mdFilePath) { + const docsRoot = path.join(context.siteDir, 'content', 'docs'); + relPath = path.relative(docsRoot, mdFilePath).replace(/\\/g, '/'); + if (relPath.startsWith('..')) { + relPath = path.basename(mdFilePath); + } + // Always output .md extension + relPath = relPath.replace(/\.(mdx|md)$/, '.md'); + } + const mdUrl = relPath ? `content/docs/${relPath}` : ''; + + if (description && mdUrl) { + content += `- [${title}](${mdUrl}): ${description}\n`; + } else if (mdUrl) { + content += `- [${title}](${mdUrl})\n`; + } else if (description) { + content += `- ${title}: ${description}\n`; } else { - content += `- [${title}](${url})\n`; + content += `- ${title}\n`; } }); @@ -329,27 +388,9 @@ function getRouteDescription(route) { if (route.metadata?.description) { return route.metadata.description; } - - // Could add logic to extract description from frontmatter - // or generate default descriptions based on category/path return null; } -/** - * Filter routes to only include documentation routes - */ -function filterDocumentationRoutes(routes) { - return routes.filter((route) => { - return ( - route.path.startsWith('/docs/') && - route.path !== '/docs/' && - !route.path.includes('/api/') && - !route.path.includes('/_') && - route.component !== '@theme/NotFound/Content' - ); - }); -} - /** * Auto-detect categories by parsing sidebars and analyzing route structure */ @@ -546,4 +587,112 @@ function sortCategories(categories) { return sorted; } +/** + * Main plugin + */ +async function pluginLlmsTxt(context, options) { + return { + name: 'llms-txt-plugin', + + async postBuild({outDir, routes, ...buildContext}) { + try { + // Use Docusaurus routes data instead of manual file scanning + const docRoutes = filterDocumentationRoutes(routes); + + // If routes are not properly populated, fallback to file scanning + let finalRoutes = docRoutes; + if (docRoutes.length < 10) { + console.log( + '⚠️ Routes seem incomplete, falling back to file scanning...', + ); + const contentDir = path.join(context.siteDir, 'content', 'docs'); + finalRoutes = await scanDocumentationFiles(contentDir); + } + + // Auto-detect categories from sidebars and route structure + const routesByCategory = await groupRoutesByCategoryAuto( + finalRoutes, + context, + ); + + // Generate the llms.txt content + const llmsTxtContent = generateLlmsTxtContent( + routesByCategory, + context, + ); + + // Write the llms.txt file to the build output + const llmsTxtPath = path.join(outDir, 'llms.txt'); + await fs.promises.writeFile(llmsTxtPath, llmsTxtContent, 'utf8'); + + // Copy and CLEAN referenced markdown files to the build output (as .md) + const docsRoot = path.join(context.siteDir, 'content', 'docs'); + let copied = 0, + errors = 0; + for (const route of finalRoutes) { + let srcFile = route.filePath; + if (!srcFile) continue; + + // Add skip logic here + const skipPatterns = [ + '_template.mdx', + /^_/, // files/folders starting with "_", + 'versions.mdx', + ]; + if ( + skipPatterns.some((pat) => + typeof pat === 'string' + ? srcFile.includes(pat) + : pat.test(path.basename(srcFile)), + ) + ) { + continue; // Skip this file silently + } + + // Always copy as .md + let destRel = path.relative(docsRoot, srcFile).replace(/\\/g, '/'); + destRel = destRel.replace(/\.(mdx|md)$/, '.md'); + if (destRel.startsWith('..')) { + destRel = path.basename(srcFile).replace(/\.(mdx|md)$/, '.md'); + } + const destFile = path.join(outDir, 'content', 'docs', destRel); + // Ensure directory exists + await fs.promises.mkdir(path.dirname(destFile), {recursive: true}); + // Read, clean, and write file + try { + const rawContent = await fs.promises.readFile(srcFile, 'utf8'); + const preCleanedContent = preCleanDocusaurusMDX(rawContent); + let cleanedContent; + try { + cleanedContent = await cleanMarkdownForLLM(preCleanedContent); + } catch (err) { + console.warn( + `Warning: Could not fully process ${srcFile}, writing pre-cleaned version instead:`, + err.message, + ); + cleanedContent = preCleanedContent; + errors++; + } + await fs.promises.writeFile(destFile, cleanedContent, 'utf8'); + copied++; + } catch (err) { + console.warn( + `Warning: Could not read or write ${srcFile}:`, + err.message, + ); + errors++; + } + } + + console.log( + `✅ Generated llms.txt with ${finalRoutes.length} documentation pages. Copied and cleaned ${copied} markdown sources. ${errors ? `(${errors} errors)` : ''}`, + ); + } catch (error) { + console.error('❌ Error generating llms.txt:', error); + throw error; + } + }, + }; +} + module.exports = pluginLlmsTxt; diff --git a/static/js/syft.js b/static/js/syft.js index fdfe2dfc3..e132467b3 100644 --- a/static/js/syft.js +++ b/static/js/syft.js @@ -1,14 +1,14 @@ !(function (t) { if (((window.syftc = t), window.syft)) return; - (window.syft = []), + ((window.syft = []), ['identify', 'track', 'page'].forEach(function (t) { window.syft[t] = function () { var s = [].slice.call(arguments); - s.unshift(t), window.syft.push(s); + (s.unshift(t), window.syft.push(s)); }; - }); + })); var s = document.createElement('script'); - (s.async = !0), + ((s.async = !0), s.setAttribute('src', 'https://cdn.syftdata.com/syftnext/syft.umd.js'), - (document.body || document.head).appendChild(s); + (document.body || document.head).appendChild(s)); })({sourceId: 'clrqv5nli0007jv09t58zir92'}); diff --git a/yarn.lock b/yarn.lock index 1f2c79d7d..f498509fa 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10280,7 +10280,7 @@ remark-gfm@^4.0.0: remark-stringify "^11.0.0" unified "^11.0.0" -remark-mdx@^3.0.0: +remark-mdx@^3.0.0, remark-mdx@^3.1.0: version "3.1.0" resolved "https://registry.npmjs.org/remark-mdx/-/remark-mdx-3.1.0.tgz" integrity sha512-Ngl/H3YXyBV9RcRNdlYsZujAmhsxwzxpDzpDEhFBVAGthS4GDgnctpDjgFl/ULx5UEDzqtW1cyBSNKqYYrqLBA== @@ -10318,6 +10318,16 @@ remark-stringify@^11.0.0: mdast-util-to-markdown "^2.0.0" unified "^11.0.0" +remark@^15.0.1: + version "15.0.1" + resolved "https://registry.yarnpkg.com/remark/-/remark-15.0.1.tgz#ac7e7563260513b66426bc47f850e7aa5862c37c" + integrity sha512-Eht5w30ruCXgFmxVUSlNWQ9iiimq07URKeFS3hNc8cUWy1llX4KDWfyEDZRycMc+znsN9Ux5/tJ/BFdgdOwA3A== + dependencies: + "@types/mdast" "^4.0.0" + remark-parse "^11.0.0" + remark-stringify "^11.0.0" + unified "^11.0.0" + renderkid@^3.0.0: version "3.0.0" resolved "https://registry.npmjs.org/renderkid/-/renderkid-3.0.0.tgz"