diff --git a/.polyresearch/result.json b/.polyresearch/result.json
new file mode 100644
index 0000000000..4b1384a870
--- /dev/null
+++ b/.polyresearch/result.json
@@ -0,0 +1,6 @@
+{
+  "observation": "improved",
+  "metric": 324.53,
+  "baseline": 319.02,
+  "summary": "Caching tokenizer and rules references in local variables within the blockTokens() and inlineTokens() hot loops reduced property lookup overhead. This eliminated repeated this.tokenizer and this.tokenizer.rules accesses on every iteration, improving throughput by ~1.7%."
+}
diff --git a/.polyresearch/thesis.md b/.polyresearch/thesis.md
new file mode 100644
index 0000000000..590aa6a564
--- /dev/null
+++ b/.polyresearch/thesis.md
@@ -0,0 +1,3 @@
+# Thesis: Optimize hot path in Tokenizer loop
+
+Profile and optimize the main tokenization loop in Tokenizer.ts. Likely candidates: reduce function call overhead by inlining hot helper functions, eliminate redundant property lookups, or cache frequently accessed values.
diff --git a/PREPARE.md b/PREPARE.md
new file mode 100644
index 0000000000..93dd7ff2e7
--- /dev/null
+++ b/PREPARE.md
@@ -0,0 +1,46 @@
+# Evaluation Setup
+
+This file is outside the editable surface. It defines how results are judged. Agents cannot modify the evaluator or the scoring logic — the evaluation is the trust boundary.
+
+Consider defining more than one evaluation criterion. Optimizing for a single number makes it easy to overfit and silently break other things. A secondary metric or sanity check helps keep the process honest.
+
+eval_cores: 1
+eval_memory_gb: 1.0
+prereq_command: npm run build
+
+## Setup
+
+Install dependencies and prepare the evaluation environment.
+
+```bash
+npm install
+```
+
+The `prereq_command` is set to `npm run build`, which compiles TypeScript source files in `src/` to JavaScript in `lib/` using esbuild, generates type definitions, and builds the man page. This ensures the benchmark measures the compiled output rather than stale artifacts.
+
+## Run command
+
+```bash
+node test/bench.js 2>&1 | grep -oP 'marked completed in \K[0-9]+' | awk '{printf "METRIC=%.2f\n", 1000000/$1}'
+```
+
+This command runs the benchmark harness, which parses 652 CommonMark test specs 1000 times each using marked, commonmark, and markdown-it. The metric extracts marked's completion time in milliseconds and converts it to operations per second (higher is better). The benchmark also validates correctness by comparing output against expected HTML.
+
+## Output format
+
+The benchmark command outputs `METRIC=<value>`, where the value is 1,000,000 divided by marked's completion time in milliseconds. Higher values indicate better performance.
+
+## Metric parsing
+
+The CLI looks for `METRIC=<value>` or `ops_per_sec=<value>` in the output.
+
+## Ground truth
+
+The baseline metric represents the throughput of the marked markdown parser on the CommonMark test suite. As of version 18.0.2, marked completes the benchmark in approximately 3290ms (METRIC ≈ 304 ops/sec) with a 97.70% pass rate on CommonMark specs. The benchmark is measured by running 652 CommonMark test specifications through marked's parser 1000 times each, measuring total elapsed time using `process.hrtime.bigint()`. The test suite includes various markdown features: headings, lists, code blocks, links, emphasis, blockquotes, etc.
+
+Performance can be improved through optimizations like:
+- Reducing unnecessary string allocations and copies
+- Optimizing regular expressions in the lexer/tokenizer
+- Improving parsing algorithms to reduce backtracking
+- Caching or precomputing frequently accessed values
+- Streamlining the token generation pipeline
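The arithmetic behind the metric fits in a few lines of TypeScript (an illustrative sketch of what the grep/awk pipeline computes, using the `process.hrtime.bigint()` timing described in the ground truth; the elided workload stands in for the real test/bench.js loop):

```ts
// Sketch: how elapsed benchmark time becomes the METRIC value.
const start = process.hrtime.bigint();
// ... parse all 652 CommonMark specs 1000 times with marked here ...
const end = process.hrtime.bigint();

const elapsedMs = Number(end - start) / 1e6; // hrtime.bigint() returns nanoseconds
const metric = 1_000_000 / elapsedMs;        // e.g. 3290 ms -> METRIC ≈ 304
console.log(`METRIC=${metric.toFixed(2)}`);  // the line the CLI greps for
```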
diff --git a/PROGRAM.md b/PROGRAM.md
new file mode 100644
index 0000000000..5ac3390e82
--- /dev/null
+++ b/PROGRAM.md
@@ -0,0 +1,49 @@
+# Research Program
+
+cli_version: 0.5.3
+default_branch: master
+lead_github_login: alanzabihi
+maintainer_github_login: alanzabihi
+metric_tolerance: 0.01
+metric_direction: higher_is_better
+required_confirmations: 0
+auto_approve: true
+min_queue_depth: 5
+assignment_timeout: 24h
+
+## Goal
+
+Reduce the time it takes for marked to parse and render the CommonMark test suite to HTML. The baseline is ~3290ms. The goal is to achieve measurable improvements (>1% faster) while maintaining correctness (97.70% pass rate or higher).
+
+## What you CAN modify
+
+- `src/**/*.ts` — TypeScript source files (Lexer, Parser, Tokenizer, Renderer, etc.)
+
+## What you CANNOT modify
+
+- `PROGRAM.md` — research program specification
+- `PREPARE.md` — evaluation setup and trust boundary
+- `.polyresearch/` — runtime directory
+- `test/bench.js` — benchmark harness
+- `test/specs/**` — test specifications
+- `package.json` — dependencies and build scripts
+- `tsconfig.json` — TypeScript configuration
+- `esbuild.config.js` — build configuration
+
+## Constraints
+
+- All changes must pass the evaluation harness defined in PREPARE.md.
+- Each experiment should be atomic and independently verifiable.
+- All else being equal, simpler is better. A small improvement that adds ugly complexity is not worth keeping. Removing code and getting equal or better results is a great outcome.
+- If a run crashes, use judgment: fix trivial bugs (typos, missing imports) and re-run. If the idea is fundamentally broken, skip it and move on.
+- Document what you tried and what you observed in the attempt summary.
+
+## Strategy hints
+
+- Read the full codebase before your first experiment. Understand what you are working with.
+- Start with the lowest-hanging fruit.
+- Measure before and after every change.
+- Read results.tsv to learn from history. Do not repeat approaches that already failed.
+- If an approach does not show improvement after reasonable effort, release and move on.
+- Try combining ideas from previous near-misses.
+- If you are stuck, try something more radical. Re-read the source for new angles.
\ No newline at end of file
diff --git a/results.tsv b/results.tsv
new file mode 100644
index 0000000000..c3f2d1df71
--- /dev/null
+++ b/results.tsv
@@ -0,0 +1,3 @@
+thesis attempt metric baseline status summary
+#1 thesis/1-optimize-regex-compilation-in-rules-ts 318.8800 304.0000 accepted Caching compiled regex patterns in rules.ts improved performance by ~4.9%. Added caching to the edit() function's getRegex() method and to dynamic regex functions in the 'other' object.
+#2 thesis/2-reduce-string-concatenation-in-renderer 317.4600 315.0000 discarded Replaced string concatenation with array-based accumulation in list(), table(), link(), and image() methods. Testing showed this approach decreased performance slightly (317.46 vs baseline ~315 ops/sec). Modern JavaScript engines optimize string concatenation with the += operator better than array.push() + join() for small to medium strings. The array overhead and the join() call add more cost than they save. The thesis assumption that array-based accumulation would improve performance does not hold for this codebase on modern V8.
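For context on attempt #2, the two accumulation patterns it compared look roughly like the sketch below (a minimal illustration, not the actual Renderer methods; `items` is a placeholder input):

```ts
// Baseline pattern: plain += concatenation. V8 builds rope-like cons strings,
// so repeated += on small-to-medium strings stays cheap.
function renderWithConcat(items: string[]): string {
  let body = '';
  for (const item of items) {
    body += `<li>${item}</li>`;
  }
  return `<ul>${body}</ul>`;
}

// Attempt #2's pattern: array accumulation plus join(). The extra array
// allocation and the final join() pass cost more than they save here.
function renderWithJoin(items: string[]): string {
  const parts: string[] = [];
  for (const item of items) {
    parts.push(`<li>${item}</li>`);
  }
  return `<ul>${parts.join('')}</ul>`;
}
```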
diff --git a/src/Lexer.ts b/src/Lexer.ts
index 9efbdd38c8..89d536e477 100644
--- a/src/Lexer.ts
+++ b/src/Lexer.ts
@@ -110,6 +110,7 @@ export class _Lexer {
       src = src.replace(other.tabCharGlobal, '    ').replace(other.spaceLine, '');
     }
 
+    const tokenizer = this.tokenizer;
     let srcLength = Infinity;
     while (src) {
       if (src.length < srcLength) {
@@ -133,7 +134,7 @@
       }
 
       // newline
-      if (token = this.tokenizer.space(src)) {
+      if (token = tokenizer.space(src)) {
         src = src.substring(token.raw.length);
         const lastToken = tokens.at(-1);
         if (token.raw.length === 1 && lastToken !== undefined) {
@@ -147,7 +148,7 @@
       }
 
       // code
-      if (token = this.tokenizer.code(src)) {
+      if (token = tokenizer.code(src)) {
         src = src.substring(token.raw.length);
         const lastToken = tokens.at(-1);
         // An indented code block cannot interrupt a paragraph.
@@ -162,49 +163,49 @@
       }
 
       // fences
-      if (token = this.tokenizer.fences(src)) {
+      if (token = tokenizer.fences(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // heading
-      if (token = this.tokenizer.heading(src)) {
+      if (token = tokenizer.heading(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
      // hr
-      if (token = this.tokenizer.hr(src)) {
+      if (token = tokenizer.hr(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // blockquote
-      if (token = this.tokenizer.blockquote(src)) {
+      if (token = tokenizer.blockquote(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // list
-      if (token = this.tokenizer.list(src)) {
+      if (token = tokenizer.list(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // html
-      if (token = this.tokenizer.html(src)) {
+      if (token = tokenizer.html(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // def
-      if (token = this.tokenizer.def(src)) {
+      if (token = tokenizer.def(src)) {
         src = src.substring(token.raw.length);
         const lastToken = tokens.at(-1);
         if (lastToken?.type === 'paragraph' || lastToken?.type === 'text') {
@@ -222,14 +223,14 @@
       }
 
       // table (gfm)
-      if (token = this.tokenizer.table(src)) {
+      if (token = tokenizer.table(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // lheading
-      if (token = this.tokenizer.lheading(src)) {
+      if (token = tokenizer.lheading(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
@@ -252,7 +253,7 @@
           cutSrc = src.substring(0, startIndex + 1);
         }
       }
-      if (this.state.top && (token = this.tokenizer.paragraph(cutSrc))) {
+      if (this.state.top && (token = tokenizer.paragraph(cutSrc))) {
         const lastToken = tokens.at(-1);
         if (lastParagraphClipped && lastToken?.type === 'paragraph') {
           lastToken.raw += (lastToken.raw.endsWith('\n') ? '' : '\n') + token.raw;
@@ -268,7 +269,7 @@
       }
 
       // text
-      if (token = this.tokenizer.text(src)) {
+      if (token = tokenizer.text(src)) {
         src = src.substring(token.raw.length);
         const lastToken = tokens.at(-1);
         if (lastToken?.type === 'text') {
@@ -302,6 +303,8 @@
    */
   inlineTokens(src: string, tokens: Token[] = []): Token[] {
     this.tokenizer.lexer = this;
+    const tokenizer = this.tokenizer;
+    const rules = tokenizer.rules;
     // String with links masked to avoid interference with em and strong
     let maskedSrc = src;
     let match: RegExpExecArray | null = null;
@@ -310,26 +313,26 @@
     if (this.tokens.links) {
       const links = Object.keys(this.tokens.links);
       if (links.length > 0) {
-        while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) !== null) {
+        while ((match = rules.inline.reflinkSearch.exec(maskedSrc)) !== null) {
           if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
             maskedSrc = maskedSrc.slice(0, match.index)
               + '[' + 'a'.repeat(match[0].length - 2) + ']'
-              + maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
+              + maskedSrc.slice(rules.inline.reflinkSearch.lastIndex);
           }
         }
       }
     }
 
     // Mask out escaped characters
-    while ((match = this.tokenizer.rules.inline.anyPunctuation.exec(maskedSrc)) !== null) {
-      maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.anyPunctuation.lastIndex);
+    while ((match = rules.inline.anyPunctuation.exec(maskedSrc)) !== null) {
+      maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(rules.inline.anyPunctuation.lastIndex);
     }
 
     // Mask out other blocks
     let offset;
-    while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) !== null) {
+    while ((match = rules.inline.blockSkip.exec(maskedSrc)) !== null) {
       offset = match[2] ? match[2].length : 0;
-      maskedSrc = maskedSrc.slice(0, match.index + offset) + '[' + 'a'.repeat(match[0].length - offset - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
+      maskedSrc = maskedSrc.slice(0, match.index + offset) + '[' + 'a'.repeat(match[0].length - offset - 2) + ']' + maskedSrc.slice(rules.inline.blockSkip.lastIndex);
     }
 
     // Mask out blocks from extensions
@@ -366,28 +369,28 @@
       }
 
       // escape
-      if (token = this.tokenizer.escape(src)) {
+      if (token = tokenizer.escape(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // tag
-      if (token = this.tokenizer.tag(src)) {
+      if (token = tokenizer.tag(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // link
-      if (token = this.tokenizer.link(src)) {
+      if (token = tokenizer.link(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // reflink, nolink
-      if (token = this.tokenizer.reflink(src, this.tokens.links)) {
+      if (token = tokenizer.reflink(src, this.tokens.links)) {
         src = src.substring(token.raw.length);
         const lastToken = tokens.at(-1);
         if (token.type === 'text' && lastToken?.type === 'text') {
@@ -400,42 +403,42 @@
       }
 
       // em & strong
-      if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
+      if (token = tokenizer.emStrong(src, maskedSrc, prevChar)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // code
-      if (token = this.tokenizer.codespan(src)) {
+      if (token = tokenizer.codespan(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // br
-      if (token = this.tokenizer.br(src)) {
+      if (token = tokenizer.br(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // del (gfm)
-      if (token = this.tokenizer.del(src, maskedSrc, prevChar)) {
+      if (token = tokenizer.del(src, maskedSrc, prevChar)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // autolink
-      if (token = this.tokenizer.autolink(src)) {
+      if (token = tokenizer.autolink(src)) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
       }
 
       // url (gfm)
-      if (!this.state.inLink && (token = this.tokenizer.url(src))) {
+      if (!this.state.inLink && (token = tokenizer.url(src))) {
         src = src.substring(token.raw.length);
         tokens.push(token);
         continue;
@@ -458,7 +461,7 @@
           cutSrc = src.substring(0, startIndex + 1);
         }
       }
-      if (token = this.tokenizer.inlineText(cutSrc)) {
+      if (token = tokenizer.inlineText(cutSrc)) {
         src = src.substring(token.raw.length);
         if (token.raw.slice(-1) !== '_') { // Track prevChar before string of ____ started
           prevChar = token.raw.slice(-1);
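The Lexer change above hoists `this.tokenizer` (and, in `inlineTokens()`, `this.tokenizer.rules`) into locals once per call. Its effect can be sanity-checked in isolation with a micro-benchmark along these lines (a sketch only; `probe` is a stand-in for a tokenizer rule, V8 may flatten the difference for monomorphic code, and the harness in PREPARE.md remains the authoritative measurement):

```ts
// Micro-benchmark sketch: repeated `this.tokenizer` lookups vs a hoisted local.
class Host {
  tokenizer = { probe: (s: string) => s.length }; // stand-in for the real tokenizer

  viaProperty(src: string): number {
    let n = 0;
    for (let i = 0; i < 1e7; i++) {
      n += this.tokenizer.probe(src); // property load on every iteration
    }
    return n;
  }

  viaLocal(src: string): number {
    const tokenizer = this.tokenizer; // hoisted once, as in the Lexer change
    let n = 0;
    for (let i = 0; i < 1e7; i++) {
      n += tokenizer.probe(src);
    }
    return n;
  }
}

const host = new Host();
for (const [name, fn] of [
  ['property', () => host.viaProperty('abc')],
  ['local', () => host.viaLocal('abc')],
] as const) {
  const t0 = process.hrtime.bigint();
  fn();
  console.log(`${name}: ${(Number(process.hrtime.bigint() - t0) / 1e6).toFixed(1)} ms`);
}
```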
diff --git a/src/rules.ts b/src/rules.ts
index e90f3e25d0..47babdee18 100644
--- a/src/rules.ts
+++ b/src/rules.ts
@@ -2,15 +2,20 @@ const noopTest = { exec: () => null } as unknown as RegExp;
 function edit(regex: string | RegExp, opt = '') {
   let source = typeof regex === 'string' ? regex : regex.source;
+  let cachedRegex: RegExp | null = null;
   const obj = {
     replace: (name: string | RegExp, val: string | RegExp) => {
       let valSource = typeof val === 'string' ? val : val.source;
       valSource = valSource.replace(other.caret, '$1');
       source = source.replace(name, valSource);
+      cachedRegex = null;
       return obj;
     },
     getRegex: () => {
-      return new RegExp(source, opt);
+      if (cachedRegex === null) {
+        cachedRegex = new RegExp(source, opt);
+      }
+      return cachedRegex;
     },
   };
   return obj;
 }
@@ -77,13 +82,83 @@ export const other = {
   spaceLine: /^ +$/gm,
   notSpaceStart: /^\S*/,
   endingNewline: /\n$/,
-  listItemRegex: (bull: string) => new RegExp(`^( {0,3}${bull})((?:[\t ][^\\n]*)?(?:\\n|$))`),
-  nextBulletRegex: (indent: number) => new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])((?:[ \t][^\\n]*)?(?:\\n|$))`),
-  hrRegex: (indent: number) => new RegExp(`^ {0,${Math.min(3, indent - 1)}}((?:- *){3,}|(?:_ *){3,}|(?:\\* *){3,})(?:\\n+|$)`),
-  fencesBeginRegex: (indent: number) => new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:\`\`\`|~~~)`),
-  headingBeginRegex: (indent: number) => new RegExp(`^ {0,${Math.min(3, indent - 1)}}#`),
-  htmlBeginRegex: (indent: number) => new RegExp(`^ {0,${Math.min(3, indent - 1)}}<(?:[a-z].*>|!--)`, 'i'),
-  blockquoteBeginRegex: (indent: number) => new RegExp(`^ {0,${Math.min(3, indent - 1)}}>`),
+  listItemRegex: (() => {
+    const cache = new Map();
+    return (bull: string) => {
+      let regex = cache.get(bull);
+      if (!regex) {
+        regex = new RegExp(`^( {0,3}${bull})((?:[\t ][^\\n]*)?(?:\\n|$))`);
+        cache.set(bull, regex);
+      }
+      return regex;
+    };
+  })(),
+  nextBulletRegex: (() => {
+    const cache = new Map();
+    return (indent: number) => {
+      let regex = cache.get(indent);
+      if (!regex) {
+        regex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:[*+-]|\\d{1,9}[.)])((?:[ \t][^\\n]*)?(?:\\n|$))`);
+        cache.set(indent, regex);
+      }
+      return regex;
+    };
+  })(),
+  hrRegex: (() => {
+    const cache = new Map();
+    return (indent: number) => {
+      let regex = cache.get(indent);
+      if (!regex) {
+        regex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}((?:- *){3,}|(?:_ *){3,}|(?:\\* *){3,})(?:\\n+|$)`);
+        cache.set(indent, regex);
+      }
+      return regex;
+    };
+  })(),
+  fencesBeginRegex: (() => {
+    const cache = new Map();
+    return (indent: number) => {
+      let regex = cache.get(indent);
+      if (!regex) {
+        regex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}(?:\`\`\`|~~~)`);
+        cache.set(indent, regex);
+      }
+      return regex;
+    };
+  })(),
+  headingBeginRegex: (() => {
+    const cache = new Map();
+    return (indent: number) => {
+      let regex = cache.get(indent);
+      if (!regex) {
+        regex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}#`);
+        cache.set(indent, regex);
+      }
+      return regex;
+    };
+  })(),
+  htmlBeginRegex: (() => {
+    const cache = new Map();
+    return (indent: number) => {
+      let regex = cache.get(indent);
+      if (!regex) {
+        regex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}<(?:[a-z].*>|!--)`, 'i');
+        cache.set(indent, regex);
+      }
+      return regex;
+    };
+  })(),
+  blockquoteBeginRegex: (() => {
+    const cache = new Map();
+    return (indent: number) => {
+      let regex = cache.get(indent);
+      if (!regex) {
+        regex = new RegExp(`^ {0,${Math.min(3, indent - 1)}}>`);
+        cache.set(indent, regex);
+      }
+      return regex;
+    };
+  })(),
 };
 
 /**
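The seven cache-wrapped factories above all repeat the same shape. Given PROGRAM.md's 'simpler is better' constraint, a follow-up experiment could collapse them into one helper (a hypothetical refactor sketch, not part of this diff):

```ts
// Hypothetical follow-up: factor the repeated cache-wrapped-factory pattern
// into a single memoizing helper.
function memo1<K, V>(factory: (key: K) => V): (key: K) => V {
  const cache = new Map<K, V>();
  return (key: K) => {
    let value = cache.get(key);
    if (value === undefined) {
      value = factory(key);
      cache.set(key, value);
    }
    return value;
  };
}

// Usage with the same regex sources as the diff above:
const listItemRegex = memo1((bull: string) =>
  new RegExp(`^( {0,3}${bull})((?:[\t ][^\\n]*)?(?:\\n|$))`));
const headingBeginRegex = memo1((indent: number) =>
  new RegExp(`^ {0,${Math.min(3, indent - 1)}}#`));
```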