diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json index a2bd2d08c8..9d49a1323d 100644 --- a/src/core/config/Categories.json +++ b/src/core/config/Categories.json @@ -82,7 +82,8 @@ "Rison Decode", "To Modhex", "From Modhex", - "MIME Decoding" + "MIME Decoding", + "Escape Smart Characters" ] }, { diff --git a/src/core/operations/EscapeSmartCharacters.mjs b/src/core/operations/EscapeSmartCharacters.mjs new file mode 100644 index 0000000000..00e8f3552a --- /dev/null +++ b/src/core/operations/EscapeSmartCharacters.mjs @@ -0,0 +1,152 @@ +/** + * @author min23asdw + * @copyright Crown Copyright 2026 + * @license Apache-2.0 + */ + +import Operation from "../Operation.mjs"; + +/** + * Map of smart characters to their plain ASCII equivalents. + */ +const ESCAPE_MAP = { + // Quotation marks + "\u2018": "'", // ' LEFT SINGLE QUOTATION MARK + "\u2019": "'", // ' RIGHT SINGLE QUOTATION MARK + "\u201A": "'", // ‚ SINGLE LOW-9 QUOTATION MARK + "\u201B": "'", // ‛ SINGLE HIGH-REVERSED-9 QUOTATION MARK + "\u201C": "\"", // " LEFT DOUBLE QUOTATION MARK + "\u201D": "\"", // " RIGHT DOUBLE QUOTATION MARK + "\u201E": "\"", // „ DOUBLE LOW-9 QUOTATION MARK + "\u201F": "\"", // ‟ DOUBLE HIGH-REVERSED-9 QUOTATION MARK + "\u2039": "<", // ‹ SINGLE LEFT-POINTING ANGLE QUOTATION MARK + "\u203A": ">", // › SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + "\u00AB": "<<", // « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + "\u00BB": ">>", // » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + + // Primes + "\u2032": "'", // ′ PRIME + "\u2033": "''", // ″ DOUBLE PRIME + "\u2034": "'''", // ‴ TRIPLE PRIME + "\u2035": "'", // ‵ REVERSED PRIME + "\u2036": "''", // ‶ REVERSED DOUBLE PRIME + "\u2037": "'''", // ‷ REVERSED TRIPLE PRIME + "\u2057": "''''", // ⁗ QUADRUPLE PRIME + + // Dashes and hyphens + "\u2010": "-", // ‐ HYPHEN + "\u2011": "-", // ‑ NON-BREAKING HYPHEN + "\u2012": "-", // ‒ FIGURE DASH + "\u2013": "-", // – EN DASH + "\u2014": "--", // — EM DASH + "\u2015": "--", // ― HORIZONTAL BAR + + // Symbols + "\u00A9": "(C)", // © COPYRIGHT SIGN + "\u00AE": "(R)", // ® REGISTERED SIGN + "\u2122": "(TM)", // ™ TRADE MARK SIGN + + // Arrows + "\u2190": "<--", // ← LEFTWARDS ARROW + "\u2192": "-->", // → RIGHTWARDS ARROW + "\u2194": "<->", // ↔ LEFT RIGHT ARROW + "\u21D0": "<==", // ⇐ LEFTWARDS DOUBLE ARROW + "\u21D2": "==>", // ⇒ RIGHTWARDS DOUBLE ARROW + "\u21D4": "<=>", // ⇔ LEFT RIGHT DOUBLE ARROW + + // Dots, bullets, and ellipsis + "\u2022": ".", // • BULLET + "\u2023": ">", // ‣ TRIANGULAR BULLET + "\u2024": ".", // ․ ONE DOT LEADER + "\u2025": "..", // ‥ TWO DOT LEADER + "\u2026": "...", // … HORIZONTAL ELLIPSIS + "\u2027": ".", // ‧ HYPHENATION POINT + + // Misc punctuation + "\u2016": "||", // ‖ DOUBLE VERTICAL LINE + "\u2017": "==", // ‗ DOUBLE LOW LINE + "\u2030": "%0", // ‰ PER MILLE SIGN + "\u2031": "%00", // ‱ PER TEN THOUSAND SIGN + "\u2038": "^", // ‸ CARET + "\u203C": "!!", // ‼ DOUBLE EXCLAMATION MARK + "\u203D": "?!", // ‽ INTERROBANG + "\u2043": "-", // ⁃ HYPHEN BULLET + "\u2044": "/", // ⁄ FRACTION SLASH + "\u2045": "[-", // ⁅ LEFT SQUARE BRACKET WITH QUILL + "\u2046": "-]", // ⁆ RIGHT SQUARE BRACKET WITH QUILL + "\u2047": "??", // ⁇ DOUBLE QUESTION MARK + "\u2048": "?!", // ⁈ QUESTION EXCLAMATION MARK + "\u2049": "!?", // ⁉ EXCLAMATION QUESTION MARK + "\u204E": "*", // ⁎ LOW ASTERISK + "\u204F": ";", // ⁏ REVERSED SEMICOLON + "\u2052": "%", // ⁒ COMMERCIAL MINUS SIGN + "\u2053": "~", // ⁓ SWUNG DASH + "\u2055": "*", // ⁕ FLOWER PUNCTUATION MARK + + // Invisible operators + "\u2062": "*", // INVISIBLE TIMES + "\u2064": "+", // INVISIBLE PLUS + + // Spaces + "\u00A0": " ", // NO-BREAK SPACE +}; + +/** + * Escape Smart Characters operation + */ +class EscapeSmartCharacters extends Operation { + + /** + * EscapeSmartCharacters constructor + */ + constructor() { + super(); + + this.name = "Escape Smart Characters"; + this.module = "Default"; + this.description = "Converts smart characters (quotes, dashes, apostrophes, arrows, copyright signs, ellipses etc.) to their plain ASCII equivalents.

For characters with no obvious ASCII equivalent, the specified action will be applied."; + this.infoURL = "https://wikipedia.org/wiki/Smart_quotes"; + this.inputType = "string"; + this.outputType = "string"; + this.args = [ + { + name: "Unrecognised characters", + type: "option", + value: ["Include", "Remove", "Replace with '.'"] + } + ]; + } + + /** + * @param {string} input + * @param {Object[]} args + * @returns {string} + */ + run(input, args) { + const [unrecognisedAction] = args; + const result = []; + + for (const char of input) { + if (char in ESCAPE_MAP) { + result.push(ESCAPE_MAP[char]); + } else if (char.codePointAt(0) > 0x7F) { + switch (unrecognisedAction) { + case "Remove": + break; + case "Replace with '.'": + result.push("."); + break; + default: + result.push(char); + } + } else { + result.push(char); + } + } + + return result.join(""); + } + +} + +export default EscapeSmartCharacters; diff --git a/tests/operations/index.mjs b/tests/operations/index.mjs index f030349d2a..5f7bd9624b 100644 --- a/tests/operations/index.mjs +++ b/tests/operations/index.mjs @@ -189,6 +189,7 @@ import "./tests/JSONtoYAML.mjs"; import "./tests/YARA.mjs"; import "./tests/ParseCSR.mjs"; import "./tests/XXTEA.mjs"; +import "./tests/EscapeSmartCharacters.mjs"; const testStatus = { allTestsPassing: true, diff --git a/tests/operations/tests/EscapeSmartCharacters.mjs b/tests/operations/tests/EscapeSmartCharacters.mjs new file mode 100644 index 0000000000..08ff81c0fa --- /dev/null +++ b/tests/operations/tests/EscapeSmartCharacters.mjs @@ -0,0 +1,111 @@ +/** + * Escape Smart Characters tests. + * + * @author min23asdw + * @copyright Crown Copyright 2026 + * @license Apache-2.0 + */ + +import TestRegister from "../../lib/TestRegister.mjs"; + +TestRegister.addTests([ + { + name: "Escape Smart Characters: empty input", + input: "", + expectedOutput: "", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Include"], + }, + ], + }, + { + name: "Escape Smart Characters: ASCII passthrough", + input: "Hello, World! 123", + expectedOutput: "Hello, World! 123", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Include"], + }, + ], + }, + { + name: "Escape Smart Characters: smart quotes and dashes", + input: "\u201C\u201D\u2014\u2018\u2019 \u2192\u00A9\u2026", + expectedOutput: "\"\"--'' -->(C)...", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Include"], + }, + ], + }, + { + name: "Escape Smart Characters: guillemets and arrows", + input: "\u00AB\u00BB \u2190\u2194\u21D2", + expectedOutput: "<<>> <--<->==>", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Include"], + }, + ], + }, + { + name: "Escape Smart Characters: Remove unrecognised", + input: "\u201CHello\u201D \u2603", + expectedOutput: "\"Hello\" ", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Remove"], + }, + ], + }, + { + name: "Escape Smart Characters: Replace unrecognised with '.'", + input: "\u201CHello\u201D \u2603", + expectedOutput: "\"Hello\" .", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Replace with '.'"], + }, + ], + }, + { + name: "Escape Smart Characters: Include unrecognised", + input: "\u201CHello\u201D \u2603", + expectedOutput: "\"Hello\" \u2603", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Include"], + }, + ], + }, + { + name: "Escape Smart Characters: copyright, registered, trademark", + input: "\u00A9 \u00AE \u2122", + expectedOutput: "(C) (R) (TM)", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Include"], + }, + ], + }, + { + name: "Escape Smart Characters: non-breaking space", + input: "hello\u00A0world", + expectedOutput: "hello world", + recipeConfig: [ + { + op: "Escape Smart Characters", + args: ["Include"], + }, + ], + }, +]);