diff --git a/bootstrap/stage0/jda1 b/bootstrap/stage0/jda1 index 0cf455d..076128c 100755 Binary files a/bootstrap/stage0/jda1 and b/bootstrap/stage0/jda1 differ diff --git a/bootstrap/stage1/jda1.jda b/bootstrap/stage1/jda1.jda index 2ed4f48..fe8ef26 100644 --- a/bootstrap/stage1/jda1.jda +++ b/bootstrap/stage1/jda1.jda @@ -267,6 +267,9 @@ const OP_COPY = 99 const OP_F64X2_BIN = 100 // SLP-fused packed f64×2 binary op (ADDPD/SUBPD/MULPD/DIVPD) const OP_BITCAST = 101 // i64→f64 bitwise reinterpret (no instruction; enables SLP on f64 memory) const OP_I2FX2_BIN = 102 // SLP-fused I2F pair: CVTSI2SD×2 + MOVLHPS + packed fbin (broadcast) +const OP_F64X2_HADD = 103 // SLP reduction: acc += a[i] + a[i+1] via MOVUPD + HADDPD + ADDSD +const OP_F64X4_BIN = 104 // AVX2 4-wide: VMOVUPD ymm + VADDPD/VMULPD/etc ymm +const OP_F64X8_BIN = 105 // AVX-512 8-wide: EVEX VMOVUPD zmm + VADDPD/VMULPD/etc zmm (with CPUID guard) // x86-64 physical register numbers (Intel encoding) const PHYS_RAX = 0 @@ -620,6 +623,12 @@ let g_lea_uses: &i32 = 0 // XMM handoff cache: value_id currently in XMM0/XMM1 (-1 = unknown) let g_xmm0_id: i64 = -1 let g_xmm1_id: i64 = -1 +let g_avx8_cnt: i64 = 0 +let g_avx8_r6: &i64 = 0 +let g_avx8_r7: &i64 = 0 +let g_avx8_iid: &i64 = 0 +let g_avx8_lookup_r6: i64 = 0 +let g_avx8_lookup_r7: i64 = 0 // SLP vectorization: per-val operand0 (base pointer) for adjacency detection let g_slp_o0: &i64 = 0 // SLP broadcast flag: set by slp_find_pair when b operand is broadcast (same id) @@ -4452,7 +4461,7 @@ fn lex_global_fast() -> i32 { // Avoids >> 32 which jda0 may compile as 32-bit shift (nop on x86) tok_set_type(g_dbg_toks_init, g_lex_count, TOK_FLOAT) tok_set_start(g_dbg_toks_init, g_lex_count, int_part) - tok_set_len(g_dbg_toks_init, g_lex_count, (frac_val << 8) | frac_digits) + tok_set_len(g_dbg_toks_init, g_lex_count, frac_val * 256 + frac_digits) tok_set_imm(g_dbg_toks_init, g_lex_count, 0) tok_set_line(g_dbg_toks_init, g_lex_count, g_lex_line) g_lex_count = 
g_lex_count + 1 @@ -9071,8 +9080,8 @@ fn codegen_primary_inline(toks: &Token, pos: &i64, stab: &StructTable, src: &i8, let fl2_fpk = tok_str_len_at_raw(toks, pos[0]) pos[0] = pos[0] + 1 g_codegen_stype = TYPE_F64 - let fl2_frac = (fl2_fpk >> 8) & 16777215 - let fl2_dig = fl2_fpk & 255 + let fl2_frac = fl2_fpk / 256 + let fl2_dig = fl2_fpk - fl2_frac * 256 let fl2_sc = 1 let fl2_di = 0 loop fl2_di < fl2_dig { @@ -11717,8 +11726,8 @@ fn live_codegen_primary_small_inline(tt: i64, bb: i64) -> i64 { let fl_fpk = tok_str_len_at_raw(g_dbg_toks_init, g_main_pos) g_main_pos = g_main_pos + 1 g_codegen_stype = TYPE_F64 - let fl_frac = (fl_fpk >> 8) & 16777215 - let fl_dig = fl_fpk & 255 + let fl_frac = fl_fpk / 256 + let fl_dig = fl_fpk - fl_frac * 256 let fl_sc = 1 let fl_di = 0 loop fl_di < fl_dig { @@ -14918,9 +14927,248 @@ fn lea_fuse(jfn: &JirFunction) { } } +// slp_try_reduce: scan for FADD(acc, load1) then FADD(id, load2) with adj loads. +// Returns jj (the second FADD index) if fused, -1 otherwise. 
+fn slp_try_reduce(jfn: &JirFunction, bi: i64, ii: i64, icnt: i64) -> i64 { + let load1 = jfn.blocks[bi].instrs[ii].operand1 + // Skip I2F-wrapped loads: HADD uses MOVUPD (raw bits), not CVTSI2SD + if g_val_op[load1] == OP_I2F { ret 0 - 1 } + let r1 = slp_resolve_load(load1) + if g_val_op[r1] != OP_LOAD_MEM { ret 0 - 1 } + let acc = jfn.blocks[bi].instrs[ii].operand0 + let id1 = jfn.blocks[bi].instrs[ii].id + let jj = ii + 1 + let found = 0 - 1 + let lim = ii + 9 + loop jj < icnt and jj < lim { + let jop = jfn.blocks[bi].instrs[jj].op + let unsafe1 = 0 + if jop == OP_STORE_MEM { unsafe1 = 1 } + if jop == OP_CALL { unsafe1 = 1 } + if jop == OP_CALL_IND { unsafe1 = 1 } + if jop == OP_SYSCALL { unsafe1 = 1 } + if unsafe1 == 1 { jj = icnt } + if unsafe1 == 0 { + if jop == OP_FADD and jfn.blocks[bi].instrs[jj].dead == 0 { + if jfn.blocks[bi].instrs[jj].operand0 == id1 { + let load2 = jfn.blocks[bi].instrs[jj].operand1 + if g_val_op[load2] != OP_I2F { + let r2 = slp_resolve_load(load2) + if g_val_op[r2] == OP_LOAD_MEM { + if slp_adj(load1, load2) == 1 { + found = jj + jj = icnt + } + } + } + } + } + } + if jj < icnt { jj = jj + 1 } + } + if found < 0 { ret 0 - 1 } + // Verify id1 is only used by the second FADD (not elsewhere in the window) + let k = ii + 1 + loop k < found { + if jfn.blocks[bi].instrs[k].dead == 0 { + if jfn.blocks[bi].instrs[k].operand0 == id1 { ret 0 - 1 } + if jfn.blocks[bi].instrs[k].operand1 == id1 { ret 0 - 1 } + } + k = k + 1 + } + // Fuse: rewrite ii as OP_F64X2_HADD + let r1b = slp_resolve_load(load1) + let base_l = g_slp_o0[r1b] + let off_l = g_val_imm[r1b] + let load2f = jfn.blocks[bi].instrs[found].operand1 + let new_id = jfn.blocks[bi].instrs[found].id + jfn.blocks[bi].instrs[ii].op = OP_F64X2_HADD + jfn.blocks[bi].instrs[ii].operand0 = acc + jfn.blocks[bi].instrs[ii].operand1 = base_l + jfn.blocks[bi].instrs[ii].operand2 = off_l + jfn.blocks[bi].instrs[ii].id = new_id + instr_set_dead(&jfn.blocks[bi].instrs[found], 1) + 
slp_mark_single_use_dead(jfn, load1) + slp_mark_single_use_dead(jfn, load2f) + ret found +} + +fn slp_reduce_pass(jfn: &JirFunction) { + let bi = 0 + loop bi < jfn.block_cnt { + let icnt = jfn.blocks[bi].instr_cnt + let ii = 0 + loop ii < icnt { + let adv = 1 + if jfn.blocks[bi].instrs[ii].op == OP_FADD { + if jfn.blocks[bi].instrs[ii].dead == 0 { + let jj_r = slp_try_reduce(jfn, bi, ii, icnt) + if jj_r >= 0 { ii = jj_r + 1 adv = 0 } + } + } + ii = ii + adv + } + bi = bi + 1 + } +} + +// slp_try_widen4: look for a second OP_F64X2_BIN adjacent to instrs[ii] (same op, +// same bases, off+16). Returns jj if widened, -1 otherwise. +// The scan covers at most the next 8 instructions and gives up at the first +// store/call/syscall: fusing moves the second pair's memory loads up to position ii, +// which is only safe when nothing in between can write memory. +fn slp_try_widen4(jfn: &JirFunction, bi: i64, ii: i64, icnt: i64) -> i64 { + let op0 = jfn.blocks[bi].instrs[ii].imm + let ba = jfn.blocks[bi].instrs[ii].operand0 + let bb2 = jfn.blocks[bi].instrs[ii].operand1 + let oa = jfn.blocks[bi].instrs[ii].operand2 + let ob = jfn.blocks[bi].instrs[ii].operand3 + let jj = ii + 1 + let lim = ii + 9 + loop jj < icnt and jj < lim { + let jop = jfn.blocks[bi].instrs[jj].op + // Memory barrier (same op set slp_try_reduce treats as unsafe): do not hoist loads across it + if jop == OP_STORE_MEM or jop == OP_CALL or jop == OP_CALL_IND or jop == OP_SYSCALL { ret 0 - 1 } + if jop == OP_F64X2_BIN and jfn.blocks[bi].instrs[jj].dead == 0 { + // match1 stays 1 only if op kind, a-side base and a-side offset (+16 bytes) all line up + let match1 = 0 + if jfn.blocks[bi].instrs[jj].imm == op0 { match1 = 1 } + if jfn.blocks[bi].instrs[jj].operand0 != ba { match1 = 0 } + let oa2 = jfn.blocks[bi].instrs[jj].operand2 + let ob2 = jfn.blocks[bi].instrs[jj].operand3 + let oa_ok = 0 + let oa_need = oa + 16 + if oa2 == oa_need { oa_ok = 1 } + if oa_ok == 0 { match1 = 0 } + // Check b side: both broadcast or both non-broadcast, same base, adjacent offsets + if ob == 0 - 1 and ob2 == 0 - 1 { + if jfn.blocks[bi].instrs[jj].operand1 != bb2 { match1 = 0 } + } + if ob != 0 - 1 and ob2 != 0 - 1 { + if jfn.blocks[bi].instrs[jj].operand1 != bb2 { match1 = 0 } + let ob_need = ob + 16 + if ob2 != ob_need { match1 = 0 } + } + if ob == 0 - 1 and ob2 != 0 - 1 { match1 = 0 } + if ob != 0 - 1 and ob2 == 0 - 1 { match1 = 0 } + if match1 == 1 { + // Fuse: 
rewrite ii as OP_F64X4_BIN, stash extra result ids + let id2 = jfn.blocks[bi].instrs[jj].id + let id3 = jfn.blocks[bi].instrs[jj].itype + jfn.blocks[bi].instrs[ii].op = OP_F64X4_BIN + jfn.blocks[bi].instrs[ii].str_start = id2 + jfn.blocks[bi].instrs[ii].str_len = id3 + instr_set_dead(&jfn.blocks[bi].instrs[jj], 1) + ret jj + } + } + jj = jj + 1 + } + ret 0 - 1 +} + +fn slp_widen4(jfn: &JirFunction) { + let bi = 0 + loop bi < jfn.block_cnt { + let icnt = jfn.blocks[bi].instr_cnt + let ii = 0 + loop ii < icnt { + let adv = 1 + if jfn.blocks[bi].instrs[ii].op == OP_F64X2_BIN { + if jfn.blocks[bi].instrs[ii].dead == 0 { + let jj_r = slp_try_widen4(jfn, bi, ii, icnt) + if jj_r >= 0 { ii = jj_r + 1 adv = 0 } + } + } + ii = ii + adv + } + bi = bi + 1 + } +} + +// avx8_lookup: find the side-table entry recorded for instruction id `id` and publish its +// r6/r7 result ids in g_avx8_lookup_r6 / g_avx8_lookup_r7. On a miss both globals are left unchanged. +fn avx8_lookup(id: i64) { + let k = 0 + loop k < g_avx8_cnt { + if g_avx8_iid[k] == id { + g_avx8_lookup_r6 = g_avx8_r6[k] + g_avx8_lookup_r7 = g_avx8_r7[k] + ret ok(0) + } + k = k + 1 + } +} + +// slp_try_widen8: look for a second OP_F64X4_BIN adjacent to instrs[ii] (same op, +// same bases, off+32) and fuse both into one OP_F64X8_BIN. Returns jj if widened, -1 otherwise. +// Result ids r4/r5 go into bb_target0/bb_target1 of instrs[ii]; r6/r7 go into the g_avx8_* side table. +// The scan stops at the first store/call/syscall (the fused op loads the second half's memory at +// position ii) and refuses to fuse when the side table is full. +fn slp_try_widen8(jfn: &JirFunction, bi: i64, ii: i64, icnt: i64) -> i64 { + let op0 = jfn.blocks[bi].instrs[ii].imm + let ba = jfn.blocks[bi].instrs[ii].operand0 + let bb = jfn.blocks[bi].instrs[ii].operand1 + let oa = jfn.blocks[bi].instrs[ii].operand2 + let ob = jfn.blocks[bi].instrs[ii].operand3 + let jj = ii + 1 + let lim = ii + 9 + loop jj < icnt and jj < lim { + let jop = jfn.blocks[bi].instrs[jj].op + // Memory barrier (same op set slp_try_reduce treats as unsafe): do not hoist loads across it + if jop == OP_STORE_MEM or jop == OP_CALL or jop == OP_CALL_IND or jop == OP_SYSCALL { ret 0 - 1 } + if jfn.blocks[bi].instrs[jj].op == OP_F64X4_BIN { + if jfn.blocks[bi].instrs[jj].dead == 0 { + let m = 1 + if jfn.blocks[bi].instrs[jj].imm != op0 { m = 0 } + if jfn.blocks[bi].instrs[jj].operand0 != ba { m = 0 } + let oa2 = jfn.blocks[bi].instrs[jj].operand2 + if oa2 != oa + 32 { m = 0 } + let ob2 = jfn.blocks[bi].instrs[jj].operand3 + if ob == 0 - 1 and ob2 == 0 - 1 { + if jfn.blocks[bi].instrs[jj].operand1 != bb { m = 0 } + } + if ob != 0 - 1 and ob2 != 0 - 1 { + if jfn.blocks[bi].instrs[jj].operand1 != bb { m = 0 } + if ob2 != ob + 32 { m = 0 } + } + if ob == 0 - 1 and ob2 != 0 - 1 { m = 0 } + if ob != 0 - 1 and ob2 == 0 - 1 { m = 0 } + if m == 1 { + let 
r4 = jfn.blocks[bi].instrs[jj].id + let r5 = jfn.blocks[bi].instrs[jj].itype + let r6 = jfn.blocks[bi].instrs[jj].str_start + let r7 = jfn.blocks[bi].instrs[jj].str_len + // Side table full: keep the two OP_F64X4_BIN ops rather than emit an + // OP_F64X8_BIN whose r6/r7 ids avx8_lookup could not find later. + if g_avx8_cnt >= 64 { ret 0 - 1 } + g_avx8_iid[g_avx8_cnt] = jfn.blocks[bi].instrs[ii].id + g_avx8_r6[g_avx8_cnt] = r6 + g_avx8_r7[g_avx8_cnt] = r7 + g_avx8_cnt = g_avx8_cnt + 1 + jfn.blocks[bi].instrs[ii].op = OP_F64X8_BIN + jfn.blocks[bi].instrs[ii].bb_target0 = r4 + jfn.blocks[bi].instrs[ii].bb_target1 = r5 + instr_set_dead(&jfn.blocks[bi].instrs[jj], 1) + ret jj + } + } + } + jj = jj + 1 + } + ret 0 - 1 +} + +fn slp_widen8(jfn: &JirFunction) { + g_avx8_cnt = 0 + let bi = 0 + loop bi < jfn.block_cnt { + let icnt = jfn.blocks[bi].instr_cnt + let ii = 0 + loop ii < icnt { + let adv = 1 + if jfn.blocks[bi].instrs[ii].op == OP_F64X4_BIN { + if jfn.blocks[bi].instrs[ii].dead == 0 { + let jj_r = slp_try_widen8(jfn, bi, ii, icnt) + if jj_r >= 0 { ii = jj_r + 1 adv = 0 } + } + } + ii = ii + adv + } + bi = bi + 1 + } +} + fn lea_fuse_and_slp(jfn: &JirFunction) { lea_fuse(jfn) slp_vectorize(jfn) + slp_reduce_pass(jfn) + slp_widen4(jfn) + slp_widen8(jfn) } fn loop_promote_scan(jfn: &JirFunction) { @@ -15298,6 +15546,9 @@ fn ls_init_globals() { g_ls_free = alloc_pages(1) as &i64 g_spill_free = alloc_pages(1) as &i64 g_slp_o0 = alloc_pages(16) as &i64 + g_avx8_r6 = alloc_pages(1) as &i64 + g_avx8_r7 = alloc_pages(1) as &i64 + g_avx8_iid = alloc_pages(1) as &i64 } fn ls_init_free(pinned_cnt: i64) { @@ -19735,6 +19986,349 @@ fn lower_i2fx2_bin(ins: &Instr, ctx: &LowerCtx, out: &i8, pos: &i64) -> i64 { ret 1 } +// HADDPD xmm_dst, xmm_src: 66 0F 7C /r +fn emit_haddpd(out: &i8, pos: &i64, dst_xmm: i64, src_xmm: i64) { + emit_byte(out, pos, 0x66) + emit_byte(out, pos, 0x0F) + emit_byte(out, pos, 0x7C) + emit_byte(out, pos, 0xC0 + dst_xmm * 8 + src_xmm) +} + +// VMOVUPD ymm_dst, [base+disp]: VEX.256.66.0F 10 /r +fn emit_vmovupd_ymm_mem(out: &i8, pos: &i64, dst_ymm: i64, base: i64, disp: i64) { + let base3 = mod8(base) + 
let base_hi = base / 8 + if base_hi == 0 { + emit_byte(out, pos, 0xC5) + emit_byte(out, pos, 0xFD) + } + if base_hi != 0 { + emit_byte(out, pos, 0xC4) + emit_byte(out, pos, 0xC1) + emit_byte(out, pos, 0x7D) + } + emit_byte(out, pos, 0x10) + if base3 == 4 { + emit_byte(out, pos, 0x80 + dst_ymm * 8 + 0x04) + emit_byte(out, pos, 0x24) + emit_i32_le(out, pos, disp) + } + if base3 != 4 { + emit_byte(out, pos, 0x80 + dst_ymm * 8 + base3) + emit_i32_le(out, pos, disp) + } +} + +// VADDPD/VMULPD/etc ymm_dst, ymm_src1, ymm_src2: VEX.256.66.0F opcode (all < ymm8) +fn emit_vop_ymm3(out: &i8, pos: &i64, opcode: i64, dst_ymm: i64, src1_ymm: i64, src2_ymm: i64) { + let nv = 15 - src1_ymm + emit_byte(out, pos, 0xC5) + let byte2 = 0x80 + nv * 8 + 4 + 1 + emit_byte(out, pos, byte2) + emit_byte(out, pos, opcode) + emit_byte(out, pos, 0xC0 + dst_ymm * 8 + src2_ymm) +} + +// VEXTRACTF128 xmm_dst, ymm_src, imm8: VEX.256.66.0F3A.W0 19 /r ib +fn emit_vextractf128(out: &i8, pos: &i64, dst_xmm: i64, src_ymm: i64, imm8: i64) { + emit_byte(out, pos, 0xC4) + emit_byte(out, pos, 0xE3) + emit_byte(out, pos, 0x7D) + emit_byte(out, pos, 0x19) + emit_byte(out, pos, 0xC0 + src_ymm * 8 + dst_xmm) + emit_byte(out, pos, imm8) +} + +// VZEROUPPER: VEX.128.0F 77 (C5 F8 77) +fn emit_vzeroupper(out: &i8, pos: &i64) { + emit_byte(out, pos, 0xC5) + emit_byte(out, pos, 0xF8) + emit_byte(out, pos, 0x77) +} + +// VINSERTF128 ymm_dst, ymm_src1, xmm_src2, imm8: VEX.256.66.0F3A.W0 18 /r ib +fn emit_vinsertf128(out: &i8, pos: &i64, dst_ymm: i64, src1_ymm: i64, src2_xmm: i64, imm8: i64) { + let nv = 15 - src1_ymm + emit_byte(out, pos, 0xC4) + emit_byte(out, pos, 0xE3) + let p2 = nv * 8 + 4 + 1 + emit_byte(out, pos, p2) + emit_byte(out, pos, 0x18) + emit_byte(out, pos, 0xC0 + dst_ymm * 8 + src2_xmm) + emit_byte(out, pos, imm8) +} + +// OP_F64X2_HADD: acc += a[i] + a[i+1] via MOVUPD + HADDPD + ADDSD +// operand0=acc, operand1=base_ptr, operand2=off (byte offset) +fn lower_f64x2_hadd(ins: &Instr, ctx: &LowerCtx, out: 
&i8, pos: &i64) -> i64 { + let base_l = ins.operand1 + let off_l = ins.operand2 + let ra = get_or_load(ctx.ra, out, pos, base_l) + // MOVUPD xmm1, [base + off] → xmm1 = [a[i], a[i+1]] + emit_movupd_xmm_mem(out, pos, 1, ra, off_l) + g_xmm1_id = 0 - 1 + consume_use(ctx, base_l) + // HADDPD xmm1, xmm1 → xmm1[lo] = a[i] + a[i+1] + emit_haddpd(out, pos, 1, 1) + g_xmm1_id = 0 - 1 + // Load acc into xmm0 + f64_load_op(ctx, out, pos, 0, ins.operand0) + consume_use(ctx, ins.operand0) + // ADDSD xmm0, xmm1 → acc + (a[i] + a[i+1]) + emit_sse_rr(out, pos, 0xF2, 0x58, 0, 1) + g_xmm0_id = 0 - 1 + g_xmm1_id = 0 - 1 + let dst = regalloc_alloc(ctx.ra, out, pos, ins.id) + emit_movq_xmm_to_gpr(out, pos, dst, 0) + g_xmm0_id = ins.id + ret 1 +} + +// OP_F64X4_BIN: 4-wide AVX2 binary f64 op +// operand0=base_a, operand1=base_b (or scalar if broadcast), operand2=off_a, operand3=off_b (-1=bcast) +// id=r0, itype=r1, str_start=r2, str_len=r3 +fn lower_f64x4_bin(ins: &Instr, ctx: &LowerCtx, out: &i8, pos: &i64) -> i64 { + let base_a = ins.operand0; let base_b = ins.operand1 + let off_a = ins.operand2; let off_b = ins.operand3 + let fbin_op = ins.imm + // Load ymm0 = [a0,a1,a2,a3] + let ra = get_or_load(ctx.ra, out, pos, base_a) + emit_vmovupd_ymm_mem(out, pos, 0, ra, off_a) + consume_use(ctx, base_a) + // Load ymm1 = b side + if off_b == 0 - 1 { + // broadcast scalar base_b into ymm1 + f64_load_op(ctx, out, pos, 1, base_b) + // UNPCKLPD xmm1, xmm1: 66 0F 14 C9 + emit_byte(out, pos, 0x66) + emit_byte(out, pos, 0x0F) + emit_byte(out, pos, 0x14) + emit_byte(out, pos, 0xC9) + g_xmm1_id = 0 - 1 + // VINSERTF128 ymm1, ymm1, xmm1, 1 — duplicate to high half + emit_vinsertf128(out, pos, 1, 1, 1, 1) + consume_use(ctx, base_b) + } + if off_b != 0 - 1 { + let rb = get_or_load(ctx.ra, out, pos, base_b) + emit_vmovupd_ymm_mem(out, pos, 1, rb, off_b) + consume_use(ctx, base_b) + } + // VADDPD/etc ymm0, ymm0, ymm1 + emit_vop_ymm3(out, pos, fbin_op, 0, 0, 1) + // Extract high 128 bits before VZEROUPPER + 
emit_vextractf128(out, pos, 1, 0, 1) + g_xmm1_id = 0 - 1 + emit_vzeroupper(out, pos) + // Extract 4 scalar results + let dst0 = regalloc_alloc(ctx.ra, out, pos, ins.id) + emit_movq_xmm_to_gpr(out, pos, dst0, 0) + g_xmm0_id = ins.id + emit_movhlps(out, pos, 0, 0) + g_xmm0_id = 0 - 1 + let dst1 = regalloc_alloc(ctx.ra, out, pos, ins.itype) + emit_movq_xmm_to_gpr(out, pos, dst1, 0) + g_xmm0_id = ins.itype + let dst2 = regalloc_alloc(ctx.ra, out, pos, ins.str_start) + emit_movq_xmm_to_gpr(out, pos, dst2, 1) + g_xmm1_id = ins.str_start + emit_movhlps(out, pos, 1, 1) + g_xmm1_id = 0 - 1 + let dst3 = regalloc_alloc(ctx.ra, out, pos, ins.str_len) + emit_movq_xmm_to_gpr(out, pos, dst3, 1) + g_xmm1_id = ins.str_len + ret 1 +} + +// VMOVUPD zmm_dst, [base+disp32]: EVEX.512.66.0F W1 0x10 +fn emit_evex_vmovupd_zmm_mem(out: &i8, pos: &i64, dst: i64, base: i64, disp: i64) { + emit_evex(out, pos, 1, 1, 0, 2) + emit_byte(out, pos, 0x10) + let b3 = mod8(base) + if b3 == 4 { + emit_byte(out, pos, 0x80 + dst * 8 + 4) + emit_byte(out, pos, 0x24) + } + if b3 != 4 { + emit_byte(out, pos, 0x80 + dst * 8 + b3) + } + emit_i32_le(out, pos, disp) +} + +// EVEX.512.66.0F opcode zmm_dst, zmm_src1, zmm_src2 (3-register form, all < zmm8) +fn emit_evex_vop_zmm3(out: &i8, pos: &i64, opc: i64, d: i64, s1: i64, s2: i64) { + emit_evex(out, pos, 1, 1, s1, 2) + emit_byte(out, pos, opc) + emit_byte(out, pos, 0xC0 + d * 8 + s2) +} + +// VPBROADCASTSD zmm_dst, xmm_src: EVEX.512.66.0F38 W1 0x19 +fn emit_evex_vbroadcastsd(out: &i8, pos: &i64, dst_zmm: i64, src_xmm: i64) { + emit_evex(out, pos, 2, 1, 0, 2) + emit_byte(out, pos, 0x19) + emit_byte(out, pos, 0xC0 + dst_zmm * 8 + src_xmm) +} + +// VEXTRACTF64X4 ymm_dst, zmm_src, imm8: EVEX.512.66.0F3A W1 0x1B +fn emit_vextractf64x4(out: &i8, pos: &i64, dst_ymm: i64, src_zmm: i64, imm8: i64) { + emit_evex(out, pos, 3, 1, 0, 2) + emit_byte(out, pos, 0x1B) + emit_byte(out, pos, 0xC0 + src_zmm * 8 + dst_ymm) + emit_byte(out, pos, imm8) +} + +// Push 4 scalar f64 
results to stack from xmm0 (low pair) and xmm1 (high pair after VEXTRACTF128) +fn emit_push4_f64_from_xmm01(out: &i8, pos: &i64) { + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 0) + emit_push_r(out, pos, PHYS_RAX) + emit_movhlps(out, pos, 2, 0) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2) + emit_push_r(out, pos, PHYS_RAX) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 1) + emit_push_r(out, pos, PHYS_RAX) + emit_movhlps(out, pos, 2, 1) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2) + emit_push_r(out, pos, PHYS_RAX) +} + +// OP_F64X8_BIN: AVX-512 8-wide f64 op with runtime CPUID guard, AVX2 fallback. +// id/itype/str_start/str_len = r0-r3; bb_target0/bb_target1 = r4/r5; g_avx8_r6/r7 = r6/r7 +fn lower_f64x8_bin(ins: &Instr, ctx: &LowerCtx, out: &i8, pos: &i64) -> i64 { + let base_a = ins.operand0 + let base_b = ins.operand1 + let off_a = ins.operand2 + let off_b = ins.operand3 + let fbin_op = ins.imm + avx8_lookup(ins.id) + let r6 = g_avx8_lookup_r6 + let r7 = g_avx8_lookup_r7 + // Load base pointers into physical registers before save_pool + let ra = get_or_load(ctx.ra, out, pos, base_a) + let rb = 0 + if off_b != 0 - 1 { rb = get_or_load(ctx.ra, out, pos, base_b) } + // Save caller-saved pool regs; push RBX (CPUID clobbers it, not in pool) + emit_save_pool(ctx.ra, out, pos) + emit_push_r(out, pos, PHYS_RBX) + // CPUID leaf 7: EBX bit 16 = AVX-512F + emit_byte(out, pos, 0x31); emit_byte(out, pos, 0xC9) // xor ecx, ecx + emit_byte(out, pos, 0xB8); emit_byte(out, pos, 0x07) // mov eax, 7 + emit_byte(out, pos, 0x00); emit_byte(out, pos, 0x00); emit_byte(out, pos, 0x00) + emit_byte(out, pos, 0x0F); emit_byte(out, pos, 0xA2) // cpuid + emit_byte(out, pos, 0x0F); emit_byte(out, pos, 0xBA) // bt ebx, 16 + emit_byte(out, pos, 0xE3); emit_byte(out, pos, 0x10) + emit_pop_r(out, pos, PHYS_RBX) + // JNC fallback (0F 83 rel32) + let jnc_off = pos[0] + emit_byte(out, pos, 0x0F); emit_byte(out, pos, 0x83) + emit_i32_le(out, pos, 0) + + // ===== AVX-512 path ===== + 
emit_restore_pool(ctx.ra, out, pos) + emit_evex_vmovupd_zmm_mem(out, pos, 0, ra, off_a) + if off_b == 0 - 1 { + // broadcast scalar base_b into zmm1 via xmm2 + // NOTE(review): this get_or_load is emitted only on the AVX-512 branch; confirm the + // allocator state it leaves behind is also valid for the fallback branch below. + let rb2 = get_or_load(ctx.ra, out, pos, base_b) + emit_movq_gpr_to_xmm(out, pos, 2, rb2) + emit_evex_vbroadcastsd(out, pos, 1, 2) + } + if off_b != 0 - 1 { + emit_evex_vmovupd_zmm_mem(out, pos, 1, rb, off_b) + } + emit_evex_vop_zmm3(out, pos, fbin_op, 0, 0, 1) + // ymm1 = zmm0[511:256] (r4..r7); ymm0 keeps r0..r3 + emit_vextractf64x4(out, pos, 1, 0, 1) + // VZEROUPPER is deliberately NOT emitted here: it clears bits 255:128 of ymm0/ymm1, + // which still hold r2/r3 and r6/r7 until the VEXTRACTF128 reads below are done. + // Push r0..r3 from xmm0(low), movhlps xmm2, vextractf128 xmm2 + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 0); emit_push_r(out, pos, PHYS_RAX) + emit_movhlps(out, pos, 2, 0) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2); emit_push_r(out, pos, PHYS_RAX) + emit_vextractf128(out, pos, 2, 0, 1) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2); emit_push_r(out, pos, PHYS_RAX) + emit_movhlps(out, pos, 2, 2) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2); emit_push_r(out, pos, PHYS_RAX) + // Push r4..r7 from xmm1(low), movhlps xmm2, vextractf128 xmm2 + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 1); emit_push_r(out, pos, PHYS_RAX) + emit_movhlps(out, pos, 2, 1) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2); emit_push_r(out, pos, PHYS_RAX) + emit_vextractf128(out, pos, 2, 1, 1) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2); emit_push_r(out, pos, PHYS_RAX) + emit_movhlps(out, pos, 2, 2) + emit_movq_xmm_to_gpr(out, pos, PHYS_RAX, 2); emit_push_r(out, pos, PHYS_RAX) + // All eight lanes are on the stack; now it is safe to clear the upper vector state + emit_vzeroupper(out, pos) + // JMP done (E9 rel32) + let jmp_off = pos[0] + emit_jmp_rel32(out, pos, 0) + + // ===== AVX2 fallback path ===== + let fb_off = pos[0] + let jnc_rel = fb_off - (jnc_off + 6) + poke_byte(out, jnc_off + 2, i32_b0(jnc_rel)) + poke_byte(out, jnc_off + 3, i32_b1(jnc_rel)) + poke_byte(out, jnc_off + 4, i32_b2(jnc_rel)) + poke_byte(out, jnc_off + 5, i32_b3(jnc_rel)) + emit_restore_pool(ctx.ra, out, pos) + // First half: [base_a + off_a .. 
off_a+24] (4 doubles = 32 bytes) + emit_vmovupd_ymm_mem(out, pos, 0, ra, off_a) + if off_b == 0 - 1 { + let rb3 = get_or_load(ctx.ra, out, pos, base_b) + emit_movq_gpr_to_xmm(out, pos, 1, rb3) + emit_byte(out, pos, 0x66); emit_byte(out, pos, 0x0F) + emit_byte(out, pos, 0x14); emit_byte(out, pos, 0xC9) + emit_vinsertf128(out, pos, 1, 1, 1, 1) + } + if off_b != 0 - 1 { emit_vmovupd_ymm_mem(out, pos, 1, rb, off_b) } + emit_vop_ymm3(out, pos, fbin_op, 0, 0, 1) + emit_vextractf128(out, pos, 1, 0, 1) + emit_vzeroupper(out, pos) + emit_push4_f64_from_xmm01(out, pos) + // Second half: [base_a + off_a+32 .. off_a+56] (4 doubles) + let off_a2 = off_a + 32 + emit_vmovupd_ymm_mem(out, pos, 0, ra, off_a2) + if off_b == 0 - 1 { + let rb4 = get_or_load(ctx.ra, out, pos, base_b) + emit_movq_gpr_to_xmm(out, pos, 1, rb4) + emit_byte(out, pos, 0x66); emit_byte(out, pos, 0x0F) + emit_byte(out, pos, 0x14); emit_byte(out, pos, 0xC9) + emit_vinsertf128(out, pos, 1, 1, 1, 1) + } + if off_b != 0 - 1 { + let off_b2 = off_b + 32 + emit_vmovupd_ymm_mem(out, pos, 1, rb, off_b2) + } + emit_vop_ymm3(out, pos, fbin_op, 0, 0, 1) + emit_vextractf128(out, pos, 1, 0, 1) + emit_vzeroupper(out, pos) + emit_push4_f64_from_xmm01(out, pos) + + // ===== Merge: pop r7..r0, allocate result registers ===== + let done_off = pos[0] + let jmp_rel = done_off - (jmp_off + 5) + poke_byte(out, jmp_off + 1, i32_b0(jmp_rel)) + poke_byte(out, jmp_off + 2, i32_b1(jmp_rel)) + poke_byte(out, jmp_off + 3, i32_b2(jmp_rel)) + poke_byte(out, jmp_off + 4, i32_b3(jmp_rel)) + // Pop in reverse push order (r7 was pushed last → at top of stack) + let dst7 = regalloc_alloc(ctx.ra, out, pos, r7) + emit_pop_r(out, pos, dst7) + let dst6 = regalloc_alloc(ctx.ra, out, pos, r6) + emit_pop_r(out, pos, dst6) + let dst5 = regalloc_alloc(ctx.ra, out, pos, ins.bb_target1) + emit_pop_r(out, pos, dst5) + let dst4 = regalloc_alloc(ctx.ra, out, pos, ins.bb_target0) + emit_pop_r(out, pos, dst4) + let dst3 = regalloc_alloc(ctx.ra, out, pos, 
ins.str_len) + emit_pop_r(out, pos, dst3) + let dst2 = regalloc_alloc(ctx.ra, out, pos, ins.str_start) + emit_pop_r(out, pos, dst2) + let dst1 = regalloc_alloc(ctx.ra, out, pos, ins.itype) + emit_pop_r(out, pos, dst1) + let dst0 = regalloc_alloc(ctx.ra, out, pos, ins.id) + emit_pop_r(out, pos, dst0) + consume_use(ctx, base_a) + consume_use(ctx, base_b) + g_xmm0_id = 0 - 1 + g_xmm1_id = 0 - 1 + ret 1 +} + fn sse_pack_xmm(ctx: &LowerCtx, out: &i8, pos: &i64, xmm_dst: i64, lo_id: i64, hi_id: i64) { // Pack lo_id into xmm_dst low lane, hi_id into xmm_dst high lane. // If lo and hi come from adjacent LOAD_MEM, use single MOVUPD. @@ -19968,6 +20562,9 @@ fn lower_instr_float(ins: &Instr, ctx: &LowerCtx, out: &i8, pos: &i64) -> i64 { // SLP-fused packed f64×2 binary op if op == OP_F64X2_BIN { ret lower_f64x2_bin(ins, ctx, out, pos) } if op == OP_I2FX2_BIN { ret lower_i2fx2_bin(ins, ctx, out, pos) } + if op == OP_F64X2_HADD { ret lower_f64x2_hadd(ins, ctx, out, pos) } + if op == OP_F64X4_BIN { ret lower_f64x4_bin(ins, ctx, out, pos) } + if op == OP_F64X8_BIN { ret lower_f64x8_bin(ins, ctx, out, pos) } // Unary float ops: FNEG, FSQRT if op == OP_FNEG { @@ -24299,6 +24896,18 @@ fn lower_fn_mark_uses_instr(ctx: &LowerCtx, ins: &Instr) { mark_use(ctx, o0) mark_use(ctx, o1) } + if op == OP_F64X2_HADD { + mark_use(ctx, o0) // acc + mark_use(ctx, o1) // base ptr + } + if op == OP_F64X4_BIN { + mark_use(ctx, o0) // base_a + mark_use(ctx, o1) // base_b or scalar + } + if op == OP_F64X8_BIN { + mark_use(ctx, o0) + mark_use(ctx, o1) + } if op == OP_FCMP_LT or op == OP_FCMP_GT or op == OP_FCMP_EQ or op == OP_FPOW { mark_use(ctx, o0) mark_use(ctx, o1) @@ -24459,6 +25068,51 @@ fn lower_fn_collect_uses(jfn: &JirFunction, ctx: &LowerCtx) { } } } + if jfn.blocks[bi].instrs[ii].op == OP_F64X4_BIN { + let r1f = jfn.blocks[bi].instrs[ii].itype + if r1f >= 0 and r1f < 8192 { + if g_val_first_def[r1f] == -1 { g_val_first_def[r1f] = bi * 128 + ii } + } + let r2f = 
jfn.blocks[bi].instrs[ii].str_start + if r2f >= 0 and r2f < 8192 { + if g_val_first_def[r2f] == -1 { g_val_first_def[r2f] = bi * 128 + ii } + } + let r3f = jfn.blocks[bi].instrs[ii].str_len + if r3f >= 0 and r3f < 8192 { + if g_val_first_def[r3f] == -1 { g_val_first_def[r3f] = bi * 128 + ii } + } + } + if jfn.blocks[bi].instrs[ii].op == OP_F64X8_BIN { + let r1e = jfn.blocks[bi].instrs[ii].itype + if r1e >= 0 and r1e < 8192 { + if g_val_first_def[r1e] == -1 { g_val_first_def[r1e] = bi * 128 + ii } + } + let r2e = jfn.blocks[bi].instrs[ii].str_start + if r2e >= 0 and r2e < 8192 { + if g_val_first_def[r2e] == -1 { g_val_first_def[r2e] = bi * 128 + ii } + } + let r3e = jfn.blocks[bi].instrs[ii].str_len + if r3e >= 0 and r3e < 8192 { + if g_val_first_def[r3e] == -1 { g_val_first_def[r3e] = bi * 128 + ii } + } + let r4e = jfn.blocks[bi].instrs[ii].bb_target0 + if r4e >= 0 and r4e < 8192 { + if g_val_first_def[r4e] == -1 { g_val_first_def[r4e] = bi * 128 + ii } + } + let r5e = jfn.blocks[bi].instrs[ii].bb_target1 + if r5e >= 0 and r5e < 8192 { + if g_val_first_def[r5e] == -1 { g_val_first_def[r5e] = bi * 128 + ii } + } + avx8_lookup(jfn.blocks[bi].instrs[ii].id) + let r6e = g_avx8_lookup_r6 + if r6e >= 0 and r6e < 8192 { + if g_val_first_def[r6e] == -1 { g_val_first_def[r6e] = bi * 128 + ii } + } + let r7e = g_avx8_lookup_r7 + if r7e >= 0 and r7e < 8192 { + if g_val_first_def[r7e] == -1 { g_val_first_def[r7e] = bi * 128 + ii } + } + } } ii = ii + 1 } diff --git a/tests/conformance/stage1/pass/simd_avx2_basic.expected b/tests/conformance/stage1/pass/simd_avx2_basic.expected new file mode 100644 index 0000000..209e3ef --- /dev/null +++ b/tests/conformance/stage1/pass/simd_avx2_basic.expected @@ -0,0 +1 @@ +20 diff --git a/tests/conformance/stage1/pass/simd_avx2_basic.jda b/tests/conformance/stage1/pass/simd_avx2_basic.jda new file mode 100644 index 0000000..87040c3 --- /dev/null +++ b/tests/conformance/stage1/pass/simd_avx2_basic.jda @@ -0,0 +1,31 @@ +// Tests OP_F64X4_BIN: 
4-wide AVX2 VMOVUPD+VMULPD via paired SLP widening +fn dot4(a: &i64, b: &i64) -> f64 { + let a0: f64 = f64_bitcast(a[0]) + let a1: f64 = f64_bitcast(a[1]) + let a2: f64 = f64_bitcast(a[2]) + let a3: f64 = f64_bitcast(a[3]) + let b0: f64 = f64_bitcast(b[0]) + let b1: f64 = f64_bitcast(b[1]) + let b2: f64 = f64_bitcast(b[2]) + let b3: f64 = f64_bitcast(b[3]) + let r0: f64 = a0 * b0 + let r1: f64 = a1 * b1 + let r2: f64 = a2 * b2 + let r3: f64 = a3 * b3 + ret r0 + r1 + r2 + r3 +} + +fn main() { + let a: &i64 = alloc_pages(1) + let b: &i64 = alloc_pages(1) + a[0] = f64_from_int(1) + a[1] = f64_from_int(2) + a[2] = f64_from_int(3) + a[3] = f64_from_int(4) + b[0] = f64_from_int(2) + b[1] = f64_from_int(2) + b[2] = f64_from_int(2) + b[3] = f64_from_int(2) + let r = dot4(a, b) + print_int(f64_to_int(r)) +} diff --git a/tests/conformance/stage1/pass/simd_avx512_dot8.expected b/tests/conformance/stage1/pass/simd_avx512_dot8.expected new file mode 100644 index 0000000..ea70ce0 --- /dev/null +++ b/tests/conformance/stage1/pass/simd_avx512_dot8.expected @@ -0,0 +1 @@ +72 diff --git a/tests/conformance/stage1/pass/simd_avx512_dot8.jda b/tests/conformance/stage1/pass/simd_avx512_dot8.jda new file mode 100644 index 0000000..7b0499d --- /dev/null +++ b/tests/conformance/stage1/pass/simd_avx512_dot8.jda @@ -0,0 +1,51 @@ +// Tests OP_F64X8_BIN: 8-wide AVX-512 VMOVUPD+VMULPD via slp_widen8 (with AVX2 fallback) +fn dot8(a: &i64, b: &i64) -> f64 { + let a0: f64 = f64_bitcast(a[0]) + let a1: f64 = f64_bitcast(a[1]) + let a2: f64 = f64_bitcast(a[2]) + let a3: f64 = f64_bitcast(a[3]) + let a4: f64 = f64_bitcast(a[4]) + let a5: f64 = f64_bitcast(a[5]) + let a6: f64 = f64_bitcast(a[6]) + let a7: f64 = f64_bitcast(a[7]) + let b0: f64 = f64_bitcast(b[0]) + let b1: f64 = f64_bitcast(b[1]) + let b2: f64 = f64_bitcast(b[2]) + let b3: f64 = f64_bitcast(b[3]) + let b4: f64 = f64_bitcast(b[4]) + let b5: f64 = f64_bitcast(b[5]) + let b6: f64 = f64_bitcast(b[6]) + let b7: f64 = f64_bitcast(b[7]) + let 
r0: f64 = a0 * b0 + let r1: f64 = a1 * b1 + let r2: f64 = a2 * b2 + let r3: f64 = a3 * b3 + let r4: f64 = a4 * b4 + let r5: f64 = a5 * b5 + let r6: f64 = a6 * b6 + let r7: f64 = a7 * b7 + ret r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 +} + +fn main() { + let a: &i64 = alloc_pages(1) + let b: &i64 = alloc_pages(1) + a[0] = f64_from_int(1) + a[1] = f64_from_int(2) + a[2] = f64_from_int(3) + a[3] = f64_from_int(4) + a[4] = f64_from_int(5) + a[5] = f64_from_int(6) + a[6] = f64_from_int(7) + a[7] = f64_from_int(8) + b[0] = f64_from_int(2) + b[1] = f64_from_int(2) + b[2] = f64_from_int(2) + b[3] = f64_from_int(2) + b[4] = f64_from_int(2) + b[5] = f64_from_int(2) + b[6] = f64_from_int(2) + b[7] = f64_from_int(2) + let r = dot8(a, b) + print_int(f64_to_int(r)) +} diff --git a/tests/conformance/stage1/pass/simd_hadd_basic.expected b/tests/conformance/stage1/pass/simd_hadd_basic.expected new file mode 100644 index 0000000..f599e28 --- /dev/null +++ b/tests/conformance/stage1/pass/simd_hadd_basic.expected @@ -0,0 +1 @@ +10 diff --git a/tests/conformance/stage1/pass/simd_hadd_basic.jda b/tests/conformance/stage1/pass/simd_hadd_basic.jda new file mode 100644 index 0000000..bdfb786 --- /dev/null +++ b/tests/conformance/stage1/pass/simd_hadd_basic.jda @@ -0,0 +1,26 @@ +// Tests OP_F64X2_HADD: acc += a[i] + a[i+1] using raw f64-bit memory +fn sum_pairs_f64(a: &i64, n: i64) -> f64 { + let acc: f64 = 0.0 + let i = 0 + loop i < n { + acc = acc + f64_bitcast(a[i]) + acc = acc + f64_bitcast(a[i + 1]) + i = i + 2 + } + ret acc +} + +fn main() { + let arr: &i64 = alloc_pages(1) + let v1: f64 = f64_from_int(1) + let v2: f64 = f64_from_int(2) + let v3: f64 = f64_from_int(3) + let v4: f64 = f64_from_int(4) + arr[0] = v1 + arr[1] = v2 + arr[2] = v3 + arr[3] = v4 + let r = sum_pairs_f64(arr, 4) + let ri = f64_to_int(r) + print_int(ri) +}