6 changes: 6 additions & 0 deletions .polyresearch/result.json
@@ -0,0 +1,6 @@
{
"observation": "improved",
"metric": 324.53,
"baseline": 319.02,
"summary": "Caching tokenizer and rules references in local variables within the blockTokens() and inlineTokens() hot loops reduced property lookups overhead. This eliminated repeated this.tokenizer and this.tokenizer.rules accesses on every iteration, improving throughput by ~1.7%."
}
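The pattern described in the result summary, hoisting repeated property accesses into locals before a hot loop, can be sketched as follows. This is a simplified illustration of the technique, not the actual marked source; the `Lexer` shape and tokenizer methods here are invented for the example.

```javascript
class Lexer {
  constructor(tokenizer) {
    this.tokenizer = tokenizer;
  }

  blockTokens(chunks) {
    // Hoist the lookups out of the loop: `this.tokenizer` and
    // `this.tokenizer.rules` are read once instead of on every iteration.
    const tokenizer = this.tokenizer;
    const rules = tokenizer.rules;
    const tokens = [];
    for (const chunk of chunks) {
      if (rules.heading.test(chunk)) {
        tokens.push(tokenizer.heading(chunk));
      } else {
        tokens.push(tokenizer.text(chunk));
      }
    }
    return tokens;
  }
}
```

The win is small per iteration, which is why it only shows up (~1.7% here) in loops that run hundreds of thousands of times.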
3 changes: 3 additions & 0 deletions .polyresearch/thesis.md
@@ -0,0 +1,3 @@
# Thesis: Optimize hot path in Tokenizer loop

Profile and optimize the main tokenization loop in Tokenizer.ts. Likely candidates: reduce function call overhead by inlining hot helper functions, eliminate redundant property lookups, or cache frequently accessed values.
46 changes: 46 additions & 0 deletions PREPARE.md
@@ -0,0 +1,46 @@
# Evaluation Setup

This file is outside the editable surface. It defines how results are judged. Agents cannot modify the evaluator or the scoring logic — the evaluation is the trust boundary.

Consider defining more than one evaluation criterion. Optimizing for a single number makes it easy to overfit and silently break other things. A secondary metric or sanity check helps keep the process honest.

eval_cores: 1
eval_memory_gb: 1.0
prereq_command: npm run build

## Setup

Install dependencies and prepare the evaluation environment.

```bash
npm install
```

The `prereq_command` is set to `npm run build`, which compiles TypeScript source files in `src/` to JavaScript in `lib/` using esbuild, generates type definitions, and builds the man page. This ensures the benchmark measures freshly compiled output rather than stale artifacts.

## Run command

```bash
node test/bench.js 2>&1 | grep -oP 'marked completed in \K[0-9]+' | awk '{printf "METRIC=%.2f\n", 1000000/$1}'
```

This command runs the benchmark harness, which parses each of the 652 CommonMark test specs 1000 times using marked, commonmark, and markdown-it. The grep step extracts marked's completion time in milliseconds, and the awk step converts it to a throughput score (1,000,000 / elapsed ms; higher is better). The benchmark also validates correctness by comparing output against expected HTML.
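The arithmetic the awk step performs can be reproduced in a few lines. This is a standalone sketch; `elapsedMs` stands in for the number grepped out of the benchmark output.

```javascript
// Convert the benchmark's elapsed milliseconds into the METRIC value
// emitted by the awk step: METRIC = 1,000,000 / elapsedMs, rounded
// to two decimal places.
function toMetric(elapsedMs) {
  return Number((1_000_000 / elapsedMs).toFixed(2));
}
```

For the ~3290 ms baseline this yields the ~304 figure quoted below.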

## Output format

The benchmark command outputs `METRIC=<number>`, computed as 1,000,000 divided by the total elapsed time in milliseconds (equivalently, 1000 / elapsed seconds). Higher values indicate better performance.

## Metric parsing

The CLI looks for `METRIC=<number>` or `ops_per_sec=<number>` in the output.
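A parser honoring that contract might look like the following. This is a sketch of the documented behavior, not the CLI's actual implementation.

```javascript
// Extract the metric from benchmark output, accepting either
// `METRIC=<number>` or `ops_per_sec=<number>` on its own line.
function parseMetric(output) {
  const match = output.match(/^(?:METRIC|ops_per_sec)=([0-9]+(?:\.[0-9]+)?)$/m);
  return match ? parseFloat(match[1]) : null;
}
```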

## Ground truth

Baseline metric represents the throughput of the marked markdown parser on the CommonMark test suite. As of version 18.0.2, marked completes the benchmark in approximately 3290ms (METRIC ≈ 304 ops/sec) with a 97.70% pass rate on CommonMark specs. The benchmark is measured by running 652 CommonMark test specifications through marked's parser 1000 times each, measuring total elapsed time using `process.hrtime.bigint()`. The test suite includes various markdown features: headings, lists, code blocks, links, emphasis, blockquotes, etc.

Performance can be improved through optimizations like:
- Reducing unnecessary string allocations and copies
- Optimizing regular expressions in the lexer/tokenizer
- Improving parsing algorithms to reduce backtracking
- Caching or precomputing frequently accessed values
- Streamlining the token generation pipeline
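The timing approach described above can be sketched as a minimal harness. It assumes a synchronous `parse` function and is not the actual `test/bench.js`.

```javascript
// Measure total wall time for N iterations over a set of inputs using
// process.hrtime.bigint(), which returns monotonic nanoseconds as a BigInt.
function benchmark(parse, inputs, iterations) {
  const start = process.hrtime.bigint();
  for (let i = 0; i < iterations; i++) {
    for (const input of inputs) {
      parse(input);
    }
  }
  const elapsedNs = process.hrtime.bigint() - start;
  return Number(elapsedNs / 1_000_000n); // elapsed whole milliseconds
}
```

Using a monotonic clock matters here: `Date.now()` can jump under NTP adjustment, while `hrtime` cannot.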
49 changes: 49 additions & 0 deletions PROGRAM.md
@@ -0,0 +1,49 @@
# Research Program

cli_version: 0.5.3
default_branch: master
lead_github_login: alanzabihi
maintainer_github_login: alanzabihi
metric_tolerance: 0.01
metric_direction: higher_is_better
required_confirmations: 0
auto_approve: true
min_queue_depth: 5
assignment_timeout: 24h

## Goal

Reduce the time it takes for marked to parse and render the CommonMark test suite to HTML. The baseline is ~3290ms. The goal is to achieve measurable improvements (>1% faster) while maintaining correctness (97.70% pass rate or higher).

## What you CAN modify

- `src/**/*.ts` — TypeScript source files (Lexer, Parser, Tokenizer, Renderer, etc.)

## What you CANNOT modify

- `PROGRAM.md` — research program specification
- `PREPARE.md` — evaluation setup and trust boundary
- `.polyresearch/` — runtime directory
- `test/bench.js` — benchmark harness
- `test/specs/**` — test specifications
- `package.json` — dependencies and build scripts
- `tsconfig.json` — TypeScript configuration
- `esbuild.config.js` — build configuration

## Constraints

- All changes must pass the evaluation harness defined in PREPARE.md.
- Each experiment should be atomic and independently verifiable.
- All else being equal, simpler is better. A small improvement that adds ugly complexity is not worth keeping. Removing code and getting equal or better results is a great outcome.
- If a run crashes, use judgment: fix trivial bugs (typos, missing imports) and re-run. If the idea is fundamentally broken, skip it and move on.
- Document what you tried and what you observed in the attempt summary.

## Strategy hints

- Read the full codebase before your first experiment. Understand what you are working with.
- Start with the lowest-hanging fruit.
- Measure before and after every change.
- Read results.tsv to learn from history. Do not repeat approaches that already failed.
- If an approach does not show improvement after reasonable effort, release and move on.
- Try combining ideas from previous near-misses.
- If you are stuck, try something more radical. Re-read the source for new angles.
3 changes: 3 additions & 0 deletions results.tsv
@@ -0,0 +1,3 @@
thesis attempt metric baseline status summary
#1 thesis/1-optimize-regex-compilation-in-rules-ts 318.8800 304.0000 accepted Caching compiled regex patterns in rules.ts improved performance by ~4.9%. Added caching to the edit() function's getRegex() method and to dynamic regex functions in the 'other' object.
#2	thesis/2-reduce-string-concatenation-in-renderer	317.4600	315.0000	discarded	Replaced string concatenation with array-based accumulation in list(), table(), link(), and image() methods. Testing showed no worthwhile gain (317.46 ops/sec, below the previously accepted 318.88 and short of the >1% threshold over the ~315 baseline). Modern JavaScript engines optimize += string concatenation better than array.push() + join() for small to medium strings; the array overhead and the join() call cost more than they save. The thesis assumption that array-based accumulation would improve performance does not hold for this codebase on modern V8.
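The two accumulation styles compared in attempt #2 look roughly like this. This is an illustrative sketch, not the actual renderer code.

```javascript
// Style the codebase kept: string concatenation with +=.
// V8 represents intermediate results as rope-like ConsStrings,
// so repeated += on small-to-medium strings stays cheap.
function renderWithConcat(items) {
  let body = '';
  for (const item of items) {
    body += '<li>' + item + '</li>';
  }
  return '<ul>' + body + '</ul>';
}

// Style tried and discarded: array accumulation plus join().
// The push() calls and the final join() added more overhead
// than they saved in this benchmark.
function renderWithJoin(items) {
  const parts = [];
  for (const item of items) {
    parts.push('<li>', item, '</li>');
  }
  return '<ul>' + parts.join('') + '</ul>';
}
```

Both produce identical output; only the allocation pattern differs, which is why the result is engine- and workload-dependent.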
63 changes: 33 additions & 30 deletions src/Lexer.ts
@@ -110,6 +110,7 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
src = src.replace(other.tabCharGlobal, ' ').replace(other.spaceLine, '');
}

+const tokenizer = this.tokenizer;
let srcLength = Infinity;
while (src) {
if (src.length < srcLength) {
@@ -133,7 +134,7 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
}

// newline
-if (token = this.tokenizer.space(src)) {
+if (token = tokenizer.space(src)) {
src = src.substring(token.raw.length);
const lastToken = tokens.at(-1);
if (token.raw.length === 1 && lastToken !== undefined) {
@@ -147,7 +148,7 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
}

// code
-if (token = this.tokenizer.code(src)) {
+if (token = tokenizer.code(src)) {
src = src.substring(token.raw.length);
const lastToken = tokens.at(-1);
// An indented code block cannot interrupt a paragraph.
@@ -162,49 +163,49 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
}

// fences
-if (token = this.tokenizer.fences(src)) {
+if (token = tokenizer.fences(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// heading
-if (token = this.tokenizer.heading(src)) {
+if (token = tokenizer.heading(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// hr
-if (token = this.tokenizer.hr(src)) {
+if (token = tokenizer.hr(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// blockquote
-if (token = this.tokenizer.blockquote(src)) {
+if (token = tokenizer.blockquote(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// list
-if (token = this.tokenizer.list(src)) {
+if (token = tokenizer.list(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// html
-if (token = this.tokenizer.html(src)) {
+if (token = tokenizer.html(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// def
-if (token = this.tokenizer.def(src)) {
+if (token = tokenizer.def(src)) {
src = src.substring(token.raw.length);
const lastToken = tokens.at(-1);
if (lastToken?.type === 'paragraph' || lastToken?.type === 'text') {
@@ -222,14 +223,14 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
}

// table (gfm)
-if (token = this.tokenizer.table(src)) {
+if (token = tokenizer.table(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// lheading
-if (token = this.tokenizer.lheading(src)) {
+if (token = tokenizer.lheading(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
@@ -252,7 +253,7 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
cutSrc = src.substring(0, startIndex + 1);
}
}
-if (this.state.top && (token = this.tokenizer.paragraph(cutSrc))) {
+if (this.state.top && (token = tokenizer.paragraph(cutSrc))) {
const lastToken = tokens.at(-1);
if (lastParagraphClipped && lastToken?.type === 'paragraph') {
lastToken.raw += (lastToken.raw.endsWith('\n') ? '' : '\n') + token.raw;
Expand All @@ -268,7 +269,7 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
}

// text
-if (token = this.tokenizer.text(src)) {
+if (token = tokenizer.text(src)) {
src = src.substring(token.raw.length);
const lastToken = tokens.at(-1);
if (lastToken?.type === 'text') {
@@ -302,6 +303,8 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
*/
inlineTokens(src: string, tokens: Token[] = []): Token[] {
this.tokenizer.lexer = this;
+const tokenizer = this.tokenizer;
+const rules = tokenizer.rules;
// String with links masked to avoid interference with em and strong
let maskedSrc = src;
let match: RegExpExecArray | null = null;
@@ -310,26 +313,26 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
if (this.tokens.links) {
const links = Object.keys(this.tokens.links);
if (links.length > 0) {
-while ((match = this.tokenizer.rules.inline.reflinkSearch.exec(maskedSrc)) !== null) {
+while ((match = rules.inline.reflinkSearch.exec(maskedSrc)) !== null) {
if (links.includes(match[0].slice(match[0].lastIndexOf('[') + 1, -1))) {
maskedSrc = maskedSrc.slice(0, match.index)
+ '[' + 'a'.repeat(match[0].length - 2) + ']'
-+ maskedSrc.slice(this.tokenizer.rules.inline.reflinkSearch.lastIndex);
++ maskedSrc.slice(rules.inline.reflinkSearch.lastIndex);
}
}
}
}

// Mask out escaped characters
-while ((match = this.tokenizer.rules.inline.anyPunctuation.exec(maskedSrc)) !== null) {
-maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(this.tokenizer.rules.inline.anyPunctuation.lastIndex);
+while ((match = rules.inline.anyPunctuation.exec(maskedSrc)) !== null) {
+maskedSrc = maskedSrc.slice(0, match.index) + '++' + maskedSrc.slice(rules.inline.anyPunctuation.lastIndex);
}

// Mask out other blocks
let offset;
-while ((match = this.tokenizer.rules.inline.blockSkip.exec(maskedSrc)) !== null) {
+while ((match = rules.inline.blockSkip.exec(maskedSrc)) !== null) {
offset = match[2] ? match[2].length : 0;
-maskedSrc = maskedSrc.slice(0, match.index + offset) + '[' + 'a'.repeat(match[0].length - offset - 2) + ']' + maskedSrc.slice(this.tokenizer.rules.inline.blockSkip.lastIndex);
+maskedSrc = maskedSrc.slice(0, match.index + offset) + '[' + 'a'.repeat(match[0].length - offset - 2) + ']' + maskedSrc.slice(rules.inline.blockSkip.lastIndex);
}

// Mask out blocks from extensions
@@ -366,28 +369,28 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
}

// escape
-if (token = this.tokenizer.escape(src)) {
+if (token = tokenizer.escape(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// tag
-if (token = this.tokenizer.tag(src)) {
+if (token = tokenizer.tag(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// link
-if (token = this.tokenizer.link(src)) {
+if (token = tokenizer.link(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// reflink, nolink
-if (token = this.tokenizer.reflink(src, this.tokens.links)) {
+if (token = tokenizer.reflink(src, this.tokens.links)) {
src = src.substring(token.raw.length);
const lastToken = tokens.at(-1);
if (token.type === 'text' && lastToken?.type === 'text') {
@@ -400,42 +403,42 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
}

// em & strong
-if (token = this.tokenizer.emStrong(src, maskedSrc, prevChar)) {
+if (token = tokenizer.emStrong(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// code
-if (token = this.tokenizer.codespan(src)) {
+if (token = tokenizer.codespan(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// br
-if (token = this.tokenizer.br(src)) {
+if (token = tokenizer.br(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// del (gfm)
-if (token = this.tokenizer.del(src, maskedSrc, prevChar)) {
+if (token = tokenizer.del(src, maskedSrc, prevChar)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// autolink
-if (token = this.tokenizer.autolink(src)) {
+if (token = tokenizer.autolink(src)) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
}

// url (gfm)
-if (!this.state.inLink && (token = this.tokenizer.url(src))) {
+if (!this.state.inLink && (token = tokenizer.url(src))) {
src = src.substring(token.raw.length);
tokens.push(token);
continue;
Expand All @@ -458,7 +461,7 @@ export class _Lexer<ParserOutput = string, RendererOutput = string> {
cutSrc = src.substring(0, startIndex + 1);
}
}
-if (token = this.tokenizer.inlineText(cutSrc)) {
+if (token = tokenizer.inlineText(cutSrc)) {
src = src.substring(token.raw.length);
if (token.raw.slice(-1) !== '_') { // Track prevChar before string of ____ started
prevChar = token.raw.slice(-1);