diff --git a/src/builtin.c b/src/builtin.c
index 2b2a2d40da..c05dbdf7dc 100644
--- a/src/builtin.c
+++ b/src/builtin.c
@@ -28,6 +28,7 @@
 #endif
 #include "builtin.h"
 #include "compile.h"
+#include "jq.h"
 #include "jq_parser.h"
 #include "bytecode.h"
 #include "linker.h"
@@ -2105,6 +2106,11 @@ int builtins_bind(jq_state *jq, block* bb) {
   builtins = gen_cbinding(function_list, sizeof(function_list)/sizeof(function_list[0]), builtins);
   builtins = gen_builtin_list(builtins);
 
+  // Remember every builtin as "name/arity" for "Did you mean" hints before
+  // block_bind_referenced discards the unreferenced ones. Internal helpers
+  // whose names start with '_' are left out so they are never suggested.
+  jq_set_known_symbols(jq, block_list_funcs(builtins, 1), jv_array());
+
   *bb = block_bind_referenced(builtins, *bb, OP_IS_CALL_PSEUDO);
   return nerrors;
 }
diff --git a/src/compile.c b/src/compile.c
index 5e64946b3e..45e549a994 100644
--- a/src/compile.c
+++ b/src/compile.c
@@ -1120,9 +1120,163 @@ make_env(jv env)
   return jv_copy(r);
 }
 
+// Levenshtein edit distance between `a` and `b`, capped at `max`.
+// Returns the exact distance when it is at most `max` and max+1 otherwise,
+// which lets hopelessly different strings be abandoned early.
+static int levenshtein(const char *a, const char *b, int max) {
+  int la = (int)strlen(a);
+  int lb = (int)strlen(b);
+  // The distance can never be smaller than the difference in lengths.
+  if (abs(la - lb) > max)
+    return max + 1;
+
+  // Only the previous and the current row of the DP table are needed;
+  // both live in one allocation so there is a single buffer to release.
+  int *rows = jv_mem_alloc(2 * ((size_t)lb + 1) * sizeof(int));
+  int *prev = rows;
+  int *curr = rows + lb + 1;
+  for (int j = 0; j <= lb; j++)
+    prev[j] = j;
+
+  int result = max + 1;
+  for (int i = 1; i <= la; i++) {
+    curr[0] = i;
+    int row_min = curr[0];
+    for (int j = 1; j <= lb; j++) {
+      int v = prev[j-1] + (a[i-1] != b[j-1]);   // substitution
+      if (prev[j] + 1 < v) v = prev[j] + 1;     // deletion
+      if (curr[j-1] + 1 < v) v = curr[j-1] + 1; // insertion
+      curr[j] = v;
+      if (v < row_min) row_min = v;
+    }
+    // Row minima never decrease, so once a whole row is over the cap the
+    // final distance is over it too.
+    if (row_min > max)
+      goto done;
+    int *tmp = prev; prev = curr; curr = tmp;
+  }
+  // After the last swap `prev` is the final row. Clamp so the max+1 ceiling
+  // also holds when only the last cell exceeds the cap.
+  if (prev[lb] <= max)
+    result = prev[lb];
+done:
+  jv_mem_free(rows);
+  return result;
+}
+
+// Append the string `s` to the array `*list` unless an equal string is
+// already present. Consumes `s`.
+static void append_unique(jv *list, jv s) {
+  int n = jv_array_length(jv_copy(*list));
+  for (int k = 0; k < n; k++) {
+    jv elem = jv_array_get(jv_copy(*list), k);
+    int same = strcmp(jv_string_value(elem), jv_string_value(s)) == 0;
+    jv_free(elem);
+    if (same) {
+      jv_free(s);
+      return;
+    }
+  }
+  *list = jv_array_append(*list, s);
+}
+
+// Walk the instruction list starting at `i`, descending into subfn and
+// arglist, and record every binder (an instruction bound by itself):
+//   - STOREV binders go to `*vars` as "$name";
+//   - CLOSURE_CREATE / CLOSURE_CREATE_C binders go to `*funcs` as
+//     "name/arity".
+// Both arrays must already be valid; duplicates are not added.
+static void collect_bound_symbols_inner(inst *i, jv *vars, jv *funcs) {
+  for (; i; i = i->next) {
+    int flags = opcode_describe(i->op)->flags;
+    // Symbols starting with '*' are internal break labels, never hints.
+    if ((flags & OP_HAS_BINDING) && i->bound_by == i && i->symbol &&
+        i->symbol[0] != '*') {
+      if (i->op == STOREV) {
+        append_unique(vars, jv_string_fmt("$%s", i->symbol));
+      } else if (i->op == CLOSURE_CREATE || i->op == CLOSURE_CREATE_C) {
+        int nformals = i->nformals < 0 ? 0 : i->nformals;
+        append_unique(funcs, jv_string_fmt("%s/%d", i->symbol, nformals));
+      }
+    }
+    collect_bound_symbols_inner(i->subfn.first, vars, funcs);
+    collect_bound_symbols_inner(i->arglist.first, vars, funcs);
+  }
+}
+
+// Lazily fill `*vars` and `*funcs` with the names bound anywhere in the block
+// being expanded: the part already moved to `done`, the instruction `curr`
+// just taken out of it, and what is still queued in `rest`. Does nothing
+// when `*vars` is already valid, i.e. the names were collected before.
+static void collect_bound_symbols(block done, inst *curr, block rest,
+                                  jv *vars, jv *funcs) {
+  if (jv_is_valid(*vars))
+    return;
+  *vars = jv_array();
+  *funcs = jv_array();
+  collect_bound_symbols_inner(done.first, vars, funcs);
+  collect_bound_symbols_inner(curr, vars, funcs);
+  collect_bound_symbols_inner(rest.first, vars, funcs);
+}
+
+// Return the entry of `candidates` (an array of strings, not consumed) that
+// is closest to `needle`, or jv_invalid() when nothing is close enough.
+// At most `threshold` edits are accepted, and never more than one edit per
+// three characters of `needle` (minimum one): a short name is within a few
+// edits of almost anything, which would make the hint meaningless.
+// Ties go to the earliest candidate.
+static jv best_suggestion(const char *needle, jv candidates, int threshold) {
+  int limit = (int)strlen(needle) / 3;
+  if (limit < 1) limit = 1;
+  if (limit > threshold) limit = threshold;
+
+  int best_dist = limit + 1;
+  jv best = jv_invalid();
+  int n = jv_array_length(jv_copy(candidates));
+  for (int i = 0; i < n; i++) {
+    jv cand = jv_array_get(jv_copy(candidates), i);
+    int d = levenshtein(needle, jv_string_value(cand), limit);
+    if (d < best_dist) {
+      best_dist = d;
+      jv_free(best);
+      best = jv_copy(cand);
+    }
+    jv_free(cand);
+  }
+  return best;
+}
+
+// Upper bound on the edit distance between a typo and a suggested name.
+enum { SUGGESTION_MAX_DISTANCE = 3 };
+
+// Report a "Did you mean" hint for the undefined symbol `needle` when a name
+// in `known` (the lists kept on the jq_state) or `local` (names bound in the
+// block being expanded) is close to it. Consumes all three jv arguments.
+static void report_suggestion(jq_state *jq, jv needle, jv known, jv local) {
+  int nlocal = jv_array_length(jv_copy(local));
+  for (int k = 0; k < nlocal; k++)
+    known = jv_array_append(known, jv_array_get(jv_copy(local), k));
+  jv_free(local);
+
+  jv suggestion = best_suggestion(jv_string_value(needle), known,
+                                  SUGGESTION_MAX_DISTANCE);
+  if (jv_is_valid(suggestion))
+    jq_report_error(jq, jv_string_fmt("jq: Did you mean: %s?",
+                                      jv_string_value(suggestion)));
+  jv_free(suggestion);
+  jv_free(known);
+  jv_free(needle);
+}
+
 // Expands call instructions into a calling sequence
 static int expand_call_arglist(block* b, jv args, jv *env) {
   int errors = 0;
+
+  // Candidate names for "Did you mean" hints. They are collected lazily, on
+  // the first undefined symbol, so error-free programs pay nothing.
+  jv bound_vars = jv_invalid();
+  jv bound_funcs = jv_invalid();
+
   block ret = gen_noop();
   for (inst* curr; (curr = block_take(b));) {
     if (opcode_describe(curr->op)->flags & OP_HAS_BINDING) {
@@ -1133,12 +1287,23 @@ static int expand_call_arglist(block* b, jv args, jv *env) {
         curr->op = LOADK;
         curr->imm.constant = jv_object_get(jv_copy(args), jv_string(curr->symbol));
       } else if (!curr->bound_by) {
-        if (curr->symbol[0] == '*' && curr->symbol[1] >= '1' && curr->symbol[1] <= '3' && curr->symbol[2] == '\0')
+        if (curr->symbol[0] == '*' && curr->symbol[1] >= '1' && curr->symbol[1] <= '3' && curr->symbol[2] == '\0') {
           locfile_locate(curr->locfile, curr->source, "jq: error: break used outside labeled control structure");
-        else if (curr->op == LOADV)
+        } else if (curr->op == LOADV) {
           locfile_locate(curr->locfile, curr->source, "jq: error: $%s is not defined", curr->symbol);
-        else
+          collect_bound_symbols(ret, curr, *b, &bound_vars, &bound_funcs);
+          report_suggestion(curr->locfile->jq,
+                            jv_string_fmt("$%s", curr->symbol),
+                            jq_get_known_vars(curr->locfile->jq),
+                            jv_copy(bound_vars));
+        } else {
           locfile_locate(curr->locfile, curr->source, "jq: error: %s/%d is not defined", curr->symbol, curr->nactuals);
+          collect_bound_symbols(ret, curr, *b, &bound_vars, &bound_funcs);
+          report_suggestion(curr->locfile->jq,
+                            jv_string_fmt("%s/%d", curr->symbol, curr->nactuals),
+                            jq_get_known_funcs(curr->locfile->jq),
+                            jv_copy(bound_funcs));
+        }
         errors++;
         // don't process this instruction if it's not well-defined
         ret = BLOCK(ret, inst_block(curr));
@@ -1208,6 +1373,8 @@ static int expand_call_arglist(block* b, jv args, jv *env) {
     ret = BLOCK(ret, prelude, inst_block(curr));
   }
   *b = ret;
+  jv_free(bound_vars);
+  jv_free(bound_funcs);
   return errors;
 }
 
diff --git a/src/execute.c b/src/execute.c
index ced1298764..4f6bf2bbc0 100644
--- a/src/execute.c
+++ b/src/execute.c
@@ -48,6 +48,11 @@ struct jq_state {
   void *debug_cb_data;
   jq_msg_cb stderr_cb;
   void *stderr_cb_data;
+
+  // All known function names ("name/arity") and variable names ("$name"),
+  // collected before block_bind_referenced discards unreferenced builtins
+  jv known_funcs; // jv_array of "name/arity" strings
+  jv known_vars;  // jv_array of "$name" strings
 };
 
 struct closure {
@@ -1086,6 +1091,9 @@ jq_state *jq_init(void) {
   jq->path = jv_null();
   jq->value_at_path = jv_null();
 
+  jq->known_funcs = jv_array();
+  jq->known_vars = jv_array();
+
   jq->nomem_handler = NULL;
   jq->nomem_handler_data = NULL;
   return jq;
@@ -1138,6 +1146,8 @@ void jq_teardown(jq_state **jq) {
   bytecode_free(old_jq->bc);
   old_jq->bc = 0;
   jv_free(old_jq->attrs);
+  jv_free(old_jq->known_funcs);
+  jv_free(old_jq->known_vars);
 
   jv_mem_free(old_jq);
 }
@@ -1288,6 +1298,25 @@ jv jq_get_attr(jq_state *jq, jv attr) {
   return jv_object_get(jv_copy(jq->attrs), attr);
 }
 
+// Replace the name lists used for "Did you mean" hints. Takes ownership of
+// `funcs` and `vars` and releases the lists stored previously.
+void jq_set_known_symbols(jq_state *jq, jv funcs, jv vars) {
+  jv_free(jq->known_funcs);
+  jv_free(jq->known_vars);
+  jq->known_funcs = funcs;
+  jq->known_vars = vars;
+}
+
+// Return a new reference to the known "name/arity" list; the caller frees it.
+jv jq_get_known_funcs(jq_state *jq) {
+  return jv_copy(jq->known_funcs);
+}
+
+// Return a new reference to the known "$name" list; the caller frees it.
+jv jq_get_known_vars(jq_state *jq) {
+  return jv_copy(jq->known_vars);
+}
+
 void jq_dump_disassembly(jq_state *jq, int indent) {
   dump_disassembly(indent, jq->bc);
 }
diff --git a/src/jq.h b/src/jq.h
index 8e9a7b8cf8..f31fb12425 100644
--- a/src/jq.h
+++ b/src/jq.h
@@ -51,6 +51,10 @@ jv jq_get_lib_dirs(jq_state *);
 void jq_set_attr(jq_state *, jv, jv);
 jv jq_get_attr(jq_state *, jv);
 
+void jq_set_known_symbols(jq_state *, jv, jv);
+jv jq_get_known_funcs(jq_state *);
+jv jq_get_known_vars(jq_state *);
+
 /*
  * We use char * instead of jf for filenames here because filenames
  * should be in the process' locale's codeset, which may not be UTF-8,
diff --git a/tests/jq.test b/tests/jq.test
index 9a80341f52..e171fe8d3d 100644
--- a/tests/jq.test
+++ b/tests/jq.test
@@ -2603,6 +2603,21 @@
 try (reduce range(10001) as $_ ([]; [.]) as $x | $x | contains($x)) catch .
 null
 "Containment check too deep"
+# Tests for issue #3268 for "Did you mean?" hints for undefined symbols
+# The %%FAIL tests verify the primary error message is unchanged
+# Hint lines ("jq: Did you mean: ...") are tested in shtest
+
+%%FAIL
+to_string
+jq: error: to_string/0 is not defined at <top-level>, line 1, column 1:
+    to_string
+    ^^^^^^^^^
+
+%%FAIL
+1 as $X | $x
+jq: error: $x is not defined at <top-level>, line 1, column 11:
+    1 as $X | $x
+              ^^
 # regression test for CVE-2026-43896
 reduce range(10000) as $_ ({}; {a: .}) as $x | $x * $x | length
 null
diff --git a/tests/shtest b/tests/shtest
index 68705df255..d7558902ba 100755
--- a/tests/shtest
+++ b/tests/shtest
@@ -893,4 +893,34 @@ if echo '42' | $JQ -f "$d/nul_prog.jq" >/dev/null 2>/dev/null; then
   exit 1
 fi
 
+# Tests for issue #3268: "Did you mean?" hints for undefined symbols
+
+# Function typo: to_string -> tostring
+cat > $d/expected <<'EOF'
+jq: error: to_string/0 is not defined at <top-level>, line 1, column 1:
+    to_string
+    ^^^^^^^^^
+jq: Did you mean: tostring/0?
+jq: 1 compile error
+EOF
+$JQ -n 'to_string' > /dev/null 2> $d/out && {
+  echo "Expected compile error for 'to_string'"
+  exit 1
+}
+diff $d/out $d/expected
+
+# Variable typo: $x -> $X (case mismatch)
+cat > $d/expected <<'EOF'
+jq: error: $x is not defined at <top-level>, line 1, column 11:
+    1 as $X | $x
+              ^^
+jq: Did you mean: $X?
+jq: 1 compile error
+EOF
+$JQ -n '1 as $X | $x' > /dev/null 2> $d/out && {
+  echo "Expected compile error for '\$x'"
+  exit 1
+}
+diff $d/out $d/expected
+
 exit 0