Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -734,6 +734,19 @@ break; } + case ISD::ADD: { + unsigned NLZ = DemandedBits.countLeadingZeros(); + APInt DemandedFromOps = APInt::getLowBitsSet(BitWidth, BitWidth - NLZ); + + RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + if (DemandedFromOps.isSubsetOf(RHSKnown.Zero)) + return Op.getOperand(0); + + LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (DemandedFromOps.isSubsetOf(LHSKnown.Zero)) + return Op.getOperand(1); + break; + } case ISD::AND: { LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); Index: llvm/test/CodeGen/AArch64/srem-seteq.ll =================================================================== --- llvm/test/CodeGen/AArch64/srem-seteq.ll +++ llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -85,10 +85,11 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #28087 ; CHECK-NEXT: mov w9, #4680 -; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: lsl w10, w8, #15 -; CHECK-NEXT: bfxil w10, w8, #1, #15 -; CHECK-NEXT: cmp w9, w10, uxth +; CHECK-NEXT: mul w8, w0, w8 +; CHECK-NEXT: add w10, w8, w9 +; CHECK-NEXT: lsl w8, w8, #15 +; CHECK-NEXT: bfxil w8, w10, #1, #15 +; CHECK-NEXT: cmp w9, w8, uxth ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret %srem = srem i16 %X, 14 Index: llvm/test/CodeGen/AMDGPU/udiv64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udiv64.ll +++ llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -696,6 +696,7 @@ ; GCN-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_lo_u32 v3, s2, v2 ; GCN-NEXT: v_mul_hi_u32 v4, s2, v1 @@ -751,34 +752,33 @@ ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc +; GCN-NEXT: v_mul_hi_u32 v6, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc -; GCN-NEXT: v_add_i32_e32 v1, vcc, 0, v1 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v0, v1 +; GCN-NEXT: v_mul_lo_u32 v9, s0, v2 ; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v1 -; GCN-NEXT: v_mul_lo_u32 v10, v0, v1 +; GCN-NEXT: v_readfirstlane_b32 s0, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v0, v1 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, 1, v1 -; GCN-NEXT: v_addc_u32_e32 v9, vcc, 0, v2, vcc -; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v10 -; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v6, vcc -; GCN-NEXT: v_sub_i32_e32 v7, vcc, v3, v0 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 1, v1 +; GCN-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc +; GCN-NEXT: v_add_i32_e32 v9, vcc, s0, v9 +; GCN-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_subb_u32_e32 v6, vcc, 0, v9, vcc +; GCN-NEXT: v_sub_i32_e32 v9, vcc, v3, v0 ; GCN-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v6, vcc -; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v7, v0 -; GCN-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc +; GCN-NEXT: v_cmp_ge_u32_e32 vcc, v9, v0 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v10 ; GCN-NEXT: v_cmp_ge_u32_e64 s[0:1], v3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc +; GCN-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 ; GCN-NEXT: v_cndmask_b32_e64 v0, -1, v0, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; GCN-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v4, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc ; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[0:1] ; GCN-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4 ; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 Index: llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll =================================================================== --- llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll +++ llvm/test/CodeGen/Mips/urem-seteq-illegal-types.ll @@ -151,8 +151,8 @@ ; MIPSEL-NEXT: lui $1, 52741 ; MIPSEL-NEXT: ori $1, $1, 40665 ; MIPSEL-NEXT: multu $6, $1 -; MIPSEL-NEXT: mfhi $2 -; MIPSEL-NEXT: mflo $3 +; MIPSEL-NEXT: mflo $2 +; MIPSEL-NEXT: mfhi $3 ; MIPSEL-NEXT: multu $5, $1 ; MIPSEL-NEXT: mfhi $7 ; MIPSEL-NEXT: mflo $8 @@ -161,35 +161,35 @@ ; MIPSEL-NEXT: multu $6, $9 ; MIPSEL-NEXT: mflo $10 ; MIPSEL-NEXT: mfhi $11 -; MIPSEL-NEXT: addu $2, $8, $2 -; MIPSEL-NEXT: addu $12, $10, $2 -; MIPSEL-NEXT: sltu $2, $2, $8 -; MIPSEL-NEXT: addu $2, $7, $2 -; MIPSEL-NEXT: sltu $7, $12, $10 -; MIPSEL-NEXT: sll $8, $12, 31 -; MIPSEL-NEXT: srl $10, $12, 1 -; MIPSEL-NEXT: sll $12, $3, 1 -; MIPSEL-NEXT: srl $3, $3, 1 +; MIPSEL-NEXT: addu $3, $8, $3 +; MIPSEL-NEXT: sll $12, $3, 31 +; MIPSEL-NEXT: addu $13, $10, $3 +; MIPSEL-NEXT: sltu $3, $3, $8 +; MIPSEL-NEXT: srl $8, $2, 1 +; MIPSEL-NEXT: addu $3, $7, $3 +; MIPSEL-NEXT: sltu $7, $13, $10 +; MIPSEL-NEXT: lui $10, 60010 +; MIPSEL-NEXT: or $8, $8, $12 +; MIPSEL-NEXT: srl $12, $13, 1 ; MIPSEL-NEXT: mul $1, $4, $1 ; MIPSEL-NEXT: mul $4, $5, $9 -; MIPSEL-NEXT: sll $5, $6, 1 -; MIPSEL-NEXT: lui $6, 60010 -; MIPSEL-NEXT: addu $7, $11, $7 -; MIPSEL-NEXT: addu $2, $2, $7 -; MIPSEL-NEXT: addu $2, $4, $2 -; MIPSEL-NEXT: addu $1, $5, $1 +; MIPSEL-NEXT: sll $5, $2, 1 +; MIPSEL-NEXT: ori $9, $10, 61135 +; MIPSEL-NEXT: sll $2, $6, 1 +; MIPSEL-NEXT: addu $6, $11, $7 +; MIPSEL-NEXT: addu $3, $3, $6 +; MIPSEL-NEXT: addu $3, $4, $3 ; MIPSEL-NEXT: addu $1, $2, $1 +; MIPSEL-NEXT: addu $1, $3, $1 ; MIPSEL-NEXT: sll $2, $1, 31 -; MIPSEL-NEXT: or $4, $10, $2 -; MIPSEL-NEXT: sltiu $2, $4, 13 -; MIPSEL-NEXT: xori $4, $4, 13 -; MIPSEL-NEXT: or $3, $3, $8 -; MIPSEL-NEXT: ori $5, $6, 61135 -; MIPSEL-NEXT: sltu $3, $3, $5 -; MIPSEL-NEXT: movz $2, $3, $4 +; MIPSEL-NEXT: or $3, $12, $2 +; MIPSEL-NEXT: sltiu $2, $3, 13 +; MIPSEL-NEXT: xori $3, $3, 13 +; MIPSEL-NEXT: sltu $4, $8, $9 +; MIPSEL-NEXT: movz $2, $4, $3 ; MIPSEL-NEXT: andi $1, $1, 2 ; MIPSEL-NEXT: srl $1, $1, 1 -; MIPSEL-NEXT: or $1, $1, $12 +; MIPSEL-NEXT: or $1, $1, $5 ; MIPSEL-NEXT: andi $1, $1, 3 ; MIPSEL-NEXT: jr $ra ; MIPSEL-NEXT: movn $2, $zero, $1 Index: llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll =================================================================== --- llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll +++ llvm/test/CodeGen/PowerPC/ppc-32bit-build-vector.ll @@ -8,51 +8,38 @@ define dso_local fastcc void @BuildVectorICE() unnamed_addr { ; 32BIT-LABEL: BuildVectorICE: ; 32BIT: # %bb.0: # %entry -; 32BIT-NEXT: stwu 1, -64(1) -; 32BIT-NEXT: .cfi_def_cfa_offset 64 -; 32BIT-NEXT: li 3, .LCPI0_0@l -; 32BIT-NEXT: lis 4, .LCPI0_0@ha -; 32BIT-NEXT: addi 5, 1, 16 -; 32BIT-NEXT: addi 6, 1, 48 -; 32BIT-NEXT: li 7, 0 +; 32BIT-NEXT: stwu 1, -48(1) +; 32BIT-NEXT: .cfi_def_cfa_offset 48 ; 32BIT-NEXT: lxvw4x 34, 0, 3 -; 32BIT-NEXT: lxvw4x 35, 4, 3 -; 32BIT-NEXT: li 3, 0 +; 32BIT-NEXT: li 5, 0 +; 32BIT-NEXT: addi 3, 1, 16 ; 32BIT-NEXT: addi 4, 1, 32 -; 32BIT-NEXT: .p2align 4 +; 32BIT-NEXT: xxspltw 35, 34, 1 +; 32BIT-NEXT: .p2align 5 ; 32BIT-NEXT: .LBB0_1: # %while.body ; 32BIT-NEXT: # -; 32BIT-NEXT: stw 3, 32(1) -; 32BIT-NEXT: stw 7, 16(1) -; 32BIT-NEXT: lxvw4x 36, 0, 4 -; 32BIT-NEXT: lxvw4x 37, 0, 5 -; 32BIT-NEXT: vperm 4, 5, 4, 3 +; 32BIT-NEXT: stw 5, 16(1) +; 32BIT-NEXT: lxvw4x 36, 0, 3 ; 32BIT-NEXT: vadduwm 4, 2, 4 -; 32BIT-NEXT: xxspltw 37, 36, 1 -; 32BIT-NEXT: vadduwm 4, 4, 5 -; 32BIT-NEXT: stxvw4x 36, 0, 6 -; 32BIT-NEXT: lwz 7, 48(1) +; 32BIT-NEXT: vadduwm 4, 4, 3 +; 32BIT-NEXT: stxvw4x 36, 0, 4 +; 32BIT-NEXT: lwz 5, 32(1) ; 32BIT-NEXT: b .LBB0_1 ; ; 64BIT-LABEL: BuildVectorICE: ; 64BIT: # %bb.0: # %entry -; 64BIT-NEXT: li 3, 0 ; 64BIT-NEXT: lxvw4x 34, 0, 3 -; 64BIT-NEXT: rldimi 3, 3, 32, 0 -; 64BIT-NEXT: mtfprd 0, 3 ; 64BIT-NEXT: li 3, 0 -; 64BIT-NEXT: .p2align 4 +; 64BIT-NEXT: xxspltw 35, 34, 1 +; 64BIT-NEXT: .p2align 5 ; 64BIT-NEXT: .LBB0_1: # %while.body ; 64BIT-NEXT: # -; 64BIT-NEXT: li 4, 0 -; 64BIT-NEXT: rldimi 4, 3, 32, 0 -; 64BIT-NEXT: mtfprd 1, 4 -; 64BIT-NEXT: xxmrghd 35, 1, 0 -; 64BIT-NEXT: vadduwm 3, 2, 3 -; 64BIT-NEXT: xxspltw 36, 35, 1 -; 64BIT-NEXT: vadduwm 3, 3, 4 -; 64BIT-NEXT: xxsldwi 1, 35, 35, 3 -; 64BIT-NEXT: mffprwz 3, 1 +; 64BIT-NEXT: sldi 3, 3, 32 +; 64BIT-NEXT: mtvsrd 36, 3 +; 64BIT-NEXT: vadduwm 4, 2, 4 +; 64BIT-NEXT: vadduwm 4, 4, 3 +; 64BIT-NEXT: xxsldwi 0, 36, 36, 3 +; 64BIT-NEXT: mffprwz 3, 0 ; 64BIT-NEXT: b .LBB0_1 entry: br label %while.body Index: llvm/test/CodeGen/RISCV/div-by-constant.ll =================================================================== --- llvm/test/CodeGen/RISCV/div-by-constant.ll +++ llvm/test/CodeGen/RISCV/div-by-constant.ll @@ -295,8 +295,8 @@ ; RV64-NEXT: lui a1, 419430 ; RV64-NEXT: addiw a1, a1, 1639 ; RV64-NEXT: mul a0, a0, a1 -; RV64-NEXT: srli a1, a0, 63 ; RV64-NEXT: srai a0, a0, 33 +; RV64-NEXT: srliw a1, a0, 31 ; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: ret %1 = sdiv i32 %a, 5 @@ -736,8 +736,9 @@ ; RV32IM-NEXT: lui a1, 6 ; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: mul a0, a0, a1 -; RV32IM-NEXT: srli a1, a0, 31 ; RV32IM-NEXT: srai a0, a0, 17 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 31 ; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; @@ -747,8 +748,9 @@ ; RV32IMZB-NEXT: lui a1, 6 ; RV32IMZB-NEXT: addi a1, a1, 1639 ; RV32IMZB-NEXT: mul a0, a0, a1 -; RV32IMZB-NEXT: srli a1, a0, 31 ; RV32IMZB-NEXT: srai a0, a0, 17 +; RV32IMZB-NEXT: slli a1, a0, 16 +; RV32IMZB-NEXT: srli a1, a1, 31 ; RV32IMZB-NEXT: add a0, a0, a1 ; RV32IMZB-NEXT: ret ; Index: llvm/test/CodeGen/RISCV/div.ll =================================================================== --- llvm/test/CodeGen/RISCV/div.ll +++ llvm/test/CodeGen/RISCV/div.ll @@ -638,8 +638,8 @@ ; RV64IM-NEXT: lui a1, 419430 ; RV64IM-NEXT: addiw a1, a1, 1639 ; RV64IM-NEXT: mul a0, a0, a1 -; RV64IM-NEXT: srli a1, a0, 63 ; RV64IM-NEXT: srai a0, a0, 33 +; RV64IM-NEXT: srliw a1, a0, 31 ; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret %1 = sdiv i32 %a, 5 @@ -1171,8 +1171,9 @@ ; RV32IM-NEXT: lui a1, 6 ; RV32IM-NEXT: addi a1, a1, 1639 ; RV32IM-NEXT: mul a0, a0, a1 -; RV32IM-NEXT: srli a1, a0, 31 ; RV32IM-NEXT: srai a0, a0, 17 +; RV32IM-NEXT: slli a1, a0, 16 +; RV32IM-NEXT: srli a1, a1, 31 ; RV32IM-NEXT: add a0, a0, a1 ; RV32IM-NEXT: ret ; Index: llvm/test/CodeGen/RISCV/srem-lkk.ll =================================================================== --- llvm/test/CodeGen/RISCV/srem-lkk.ll +++ llvm/test/CodeGen/RISCV/srem-lkk.ll @@ -95,8 +95,8 @@ ; RV64IM-NEXT: lui a2, 253241 ; RV64IM-NEXT: addiw a2, a2, -15 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: srli a2, a1, 63 ; RV64IM-NEXT: srai a1, a1, 40 +; RV64IM-NEXT: srliw a2, a1, 31 ; RV64IM-NEXT: addw a1, a1, a2 ; RV64IM-NEXT: li a2, 1060 ; RV64IM-NEXT: mulw a1, a1, a2 @@ -143,8 +143,8 @@ ; RV64IM-NEXT: lui a2, 677296 ; RV64IM-NEXT: addiw a2, a2, -91 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: srli a2, a1, 63 ; RV64IM-NEXT: srai a1, a1, 40 +; RV64IM-NEXT: srliw a2, a1, 31 ; RV64IM-NEXT: addw a1, a1, a2 ; RV64IM-NEXT: li a2, -723 ; RV64IM-NEXT: mulw a1, a1, a2 @@ -194,8 +194,8 @@ ; RV64IM-NEXT: lui a2, 1036895 ; RV64IM-NEXT: addiw a2, a2, 999 ; RV64IM-NEXT: mul a1, a1, a2 -; RV64IM-NEXT: srli a2, a1, 63 ; RV64IM-NEXT: srai a1, a1, 40 +; RV64IM-NEXT: srliw a2, a1, 31 ; RV64IM-NEXT: addw a1, a1, a2 ; RV64IM-NEXT: lui a2, 1048570 ; RV64IM-NEXT: addiw a2, a2, 1595 Index: llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll =================================================================== --- llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll +++ llvm/test/CodeGen/RISCV/srem-seteq-illegal-types.ll @@ -416,8 +416,8 @@ ; RV64-NEXT: call __muldi3@plt ; RV64-NEXT: lui a1, %hi(.LCPI3_1) ; RV64-NEXT: ld a1, %lo(.LCPI3_1)(a1) -; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: slli a2, a0, 63 +; RV64-NEXT: add a0, a0, a1 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: or a0, a0, a2 ; RV64-NEXT: sltu a0, a1, a0 @@ -567,15 +567,15 @@ ; RV64M-NEXT: add a1, a1, a4 ; RV64M-NEXT: addi a1, a1, -2 ; RV64M-NEXT: seqz a1, a1 +; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: lui a4, %hi(.LCPI3_2) ; RV64M-NEXT: ld a4, %lo(.LCPI3_2)(a4) ; RV64M-NEXT: lui a5, %hi(.LCPI3_3) ; RV64M-NEXT: ld a5, %lo(.LCPI3_3)(a5) -; RV64M-NEXT: addi a2, a2, -1 ; RV64M-NEXT: seqz a2, a2 ; RV64M-NEXT: mul a3, a3, a4 -; RV64M-NEXT: add a3, a3, a5 ; RV64M-NEXT: slli a4, a3, 63 +; RV64M-NEXT: add a3, a3, a5 ; RV64M-NEXT: srli a3, a3, 1 ; RV64M-NEXT: or a3, a3, a4 ; RV64M-NEXT: sltu a3, a5, a3 Index: llvm/test/CodeGen/SystemZ/shift-08.ll =================================================================== --- llvm/test/CodeGen/SystemZ/shift-08.ll +++ llvm/test/CodeGen/SystemZ/shift-08.ll @@ -118,7 +118,6 @@ define i64 @f9(i64 %a, i64 %amt) { ; CHECK-LABEL: f9: ; CHECK: # %bb.0: -; CHECK-NEXT: afi %r3, 524288 ; CHECK-NEXT: rllg %r2, %r2, 0(%r3) ; CHECK-NEXT: br %r14 %add = add i64 %amt, 524288 Index: llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6589,16 +6589,16 @@ ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X86-NEXT: vpsllq $32, %xmm2, %xmm2 -; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 +; X86-NEXT: vpmuludq %xmm3, %xmm2, %xmm3 +; X86-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm1 +; X86-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X86-NEXT: vpsllq $32, %xmm1, %xmm1 +; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx ; X86-NEXT: vzeroupper @@ -6623,16 +6623,16 @@ ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X64-NEXT: vpsllq $32, %xmm2, %xmm2 -; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 +; X64-NEXT: vpmuludq %xmm3, %xmm2, %xmm3 +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm1 +; X64-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X64-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper ; X64-NEXT: retq @@ -6794,16 +6794,16 @@ ; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X86-NEXT: vpsllq $32, %xmm2, %xmm2 -; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 +; X86-NEXT: vpmuludq %xmm3, %xmm2, %xmm3 +; X86-NEXT: vpsrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm1 +; X86-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X86-NEXT: vpsllq $32, %xmm1, %xmm1 +; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 +; X86-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovd %xmm0, %eax ; X86-NEXT: vpextrd $1, %xmm0, %edx ; X86-NEXT: vzeroupper @@ -6831,16 +6831,16 @@ ; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] -; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 -; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X64-NEXT: vpsllq $32, %xmm2, %xmm2 -; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 +; X64-NEXT: vpmuludq %xmm3, %xmm2, %xmm3 +; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm1 +; X64-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; X64-NEXT: vpsllq $32, %xmm1, %xmm1 +; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm0 +; X64-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vmovq %xmm0, %rax ; X64-NEXT: vzeroupper ; X64-NEXT: retq Index: llvm/test/CodeGen/X86/combine-pmuldq.ll =================================================================== --- llvm/test/CodeGen/X86/combine-pmuldq.ll +++ llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -202,46 +202,39 @@ ; SSE-LABEL: PR43159: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $1, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE-NEXT: psubd %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrld $1, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; SSE-NEXT: paddd %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: psrld $7, %xmm0 -; SSE-NEXT: psrld $6, %xmm2 -; SSE-NEXT: movd %xmm2, %edi +; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE-NEXT: psrld $6, %xmm1 +; SSE-NEXT: movd %xmm1, %edi ; SSE-NEXT: pextrd $1, %xmm0, %esi -; SSE-NEXT: pextrd $2, %xmm2, %edx +; SSE-NEXT: pextrd $2, %xmm1, %edx ; SSE-NEXT: pextrd $3, %xmm0, %ecx ; SSE-NEXT: jmp foo # TAILCALL ; ; AVX1-LABEL: PR43159: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsrld $7, %xmm1, %xmm1 +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5],xmm0[6,7] ; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX1-NEXT: vpsrld $6, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %edi ; AVX1-NEXT: vpextrd $1, %xmm1, %esi Index: llvm/test/CodeGen/X86/combine-sdiv.ll =================================================================== --- llvm/test/CodeGen/X86/combine-sdiv.ll +++ llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2639,37 +2639,36 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_sdiv_nonuniform5: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32639,54613,19945,21846,2979,5243,32897,32833] +; SSE2-NEXT: pmulhw %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,0] +; SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,0] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,0,65535,65535,65535,0,65535] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psraw $4, %xmm3 +; SSE2-NEXT: psraw $8, %xmm3 ; SSE2-NEXT: pandn %xmm3, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: psraw $2, %xmm2 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,0,65535,65535,65535,0,65535] +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psraw $4, %xmm3 +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,0,65535] ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: psraw $1, %xmm1 +; SSE2-NEXT: psraw $2, %xmm1 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,65535] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: pand %xmm1, %xmm3 +; SSE2-NEXT: psraw $1, %xmm2 +; SSE2-NEXT: pandn %xmm2, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: psrlw $15, %xmm0 -; SSE2-NEXT: paddw %xmm2, %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_sdiv_nonuniform5: @@ -2677,41 +2676,40 @@ ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,1,1] ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: paddw %xmm0, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <256,16384,4096,u,u,32768,512,256> +; SSE41-NEXT: pmulhw %xmm1, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; SSE41-NEXT: psraw $1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7] +; SSE41-NEXT: psrlw $15, %xmm1 ; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,32768,512,256> -; SSE41-NEXT: pmulhw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; SSE41-NEXT: psrlw $15, %xmm0 -; SSE41-NEXT: paddw %xmm2, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_nonuniform5: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX1-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7] +; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_sdiv_nonuniform5: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] +; AVX2-NEXT: vpsraw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5],xmm2[6,7] +; AVX2-NEXT: vpsrlw $15, %xmm1, %xmm1 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-NEXT: vpsraw $1, %xmm0, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_vec_sdiv_nonuniform5: @@ -2752,33 +2750,33 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) { ; SSE2-LABEL: combine_vec_sdiv_nonuniform6: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,1,1,1,0] -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,1,1,1,0] +; SSE2-NEXT: pmullw %xmm1, %xmm0 +; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,0,0,65535,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $6, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,0,65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,0] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psraw $12, %xmm5 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psraw $8, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: psraw $6, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,0,65535,65535] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,65535,65535,0,65535,0] +; SSE2-NEXT: pand %xmm5, %xmm2 +; SSE2-NEXT: psraw $12, %xmm1 +; SSE2-NEXT: pandn %xmm1, %xmm5 +; SSE2-NEXT: por %xmm2, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pandn %xmm3, %xmm4 +; SSE2-NEXT: por %xmm5, %xmm4 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,0] -; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm4, %xmm2 ; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: psraw $1, %xmm3 -; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: psraw $1, %xmm4 +; SSE2-NEXT: pandn %xmm4, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: psrlw $15, %xmm0 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2791,13 +2789,13 @@ ; SSE41-NEXT: pmullw %xmm0, %xmm1 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: paddw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <4,256,256,u,u,512,256,8> -; SSE41-NEXT: pmulhw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; SSE41-NEXT: psrlw $15, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; SSE41-NEXT: paddw %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrlw $15, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; SSE41-NEXT: paddw %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_nonuniform6: @@ -2805,12 +2803,12 @@ ; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX1-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_sdiv_nonuniform6: @@ -2818,12 +2816,12 @@ ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] -; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4],xmm0[5,6,7] -; AVX2-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $15, %xmm0, %xmm2 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] +; AVX2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] +; AVX2-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_vec_sdiv_nonuniform6: @@ -2910,15 +2908,14 @@ ; SSE2-LABEL: pr38658: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm2, %xmm3 -; SSE2-NEXT: paddb %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE2-NEXT: psrlw $8, %xmm3 +; SSE2-NEXT: packuswb %xmm3, %xmm1 +; SSE2-NEXT: paddb %xmm1, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE2-NEXT: psraw $8, %xmm1 @@ -2938,20 +2935,21 @@ ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 ; SSE41-NEXT: packuswb %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: psraw $8, %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psllw $6, %xmm2 -; SSE41-NEXT: psllw $8, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: packuswb %xmm1, %xmm2 -; SSE41-NEXT: psrlw $7, %xmm0 -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: paddb %xmm2, %xmm0 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; SSE41-NEXT: psraw $8, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psllw $6, %xmm3 +; SSE41-NEXT: psllw $8, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] +; SSE41-NEXT: psrlw $8, %xmm2 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; SSE41-NEXT: packuswb %xmm2, %xmm0 +; SSE41-NEXT: psrlw $7, %xmm1 +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE41-NEXT: paddb %xmm0, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: pr38658: @@ -2961,18 +2959,18 @@ ; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $6, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $6, %xmm2, %xmm3 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] +; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: pr38658: Index: llvm/test/CodeGen/X86/combine-udiv.ll =================================================================== --- llvm/test/CodeGen/X86/combine-udiv.ll +++ llvm/test/CodeGen/X86/combine-udiv.ll @@ -417,19 +417,19 @@ ; ; AVX1-LABEL: combine_vec_udiv_by_shl_pow2b: ; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm4 +; AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2b: Index: llvm/test/CodeGen/X86/dpbusd_const.ll =================================================================== --- llvm/test/CodeGen/X86/dpbusd_const.ll +++ llvm/test/CodeGen/X86/dpbusd_const.ll @@ -10,8 +10,8 @@ ; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: addl %edi, %eax ; ALL-NEXT: retq @@ -24,35 +24,17 @@ } define i32 @mul_4xi8_zc(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_zc: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_zc: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vmovd %xmm1, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_zc: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_zc: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = zext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> %0, @@ -64,35 +46,39 @@ define i32 @mul_4xi4_cz(<4 x i4> %a, i32 %c) { ; AVXVNNI-LABEL: mul_4xi4_cz: ; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVXVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: {vex} vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax +; AVXVNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVXVNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVXVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVXVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVXVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVXVNNI-NEXT: vmovd %xmm0, %eax ; AVXVNNI-NEXT: addl %edi, %eax ; AVXVNNI-NEXT: retq ; ; AVX512VNNI-LABEL: mul_4xi4_cz: ; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512VNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 -; AVX512VNNI-NEXT: vmovd %xmm1, %eax +; AVX512VNNI-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] +; AVX512VNNI-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512VNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VNNI-NEXT: vmovd %xmm0, %eax ; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper ; AVX512VNNI-NEXT: retq ; ; AVX512VLVNNI-LABEL: mul_4xi4_cz: ; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpmovdb %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpdpbusd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX512VLVNNI-NEXT: vmovd %xmm1, %eax +; AVX512VLVNNI-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512VLVNNI-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512VLVNNI-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX512VLVNNI-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLVNNI-NEXT: vmovd %xmm0, %eax ; AVX512VLVNNI-NEXT: addl %edi, %eax ; AVX512VLVNNI-NEXT: retq entry: @@ -104,38 +90,17 @@ } define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { -; AVXVNNI-LABEL: mul_4xi8_cs: -; AVXVNNI: # %bb.0: # %entry -; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 -; AVXVNNI-NEXT: vmovd %xmm1, %eax -; AVXVNNI-NEXT: addl %edi, %eax -; AVXVNNI-NEXT: retq -; -; AVX512VNNI-LABEL: mul_4xi8_cs: -; AVX512VNNI: # %bb.0: # %entry -; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 -; AVX512VNNI-NEXT: vmovd %xmm2, %eax -; AVX512VNNI-NEXT: addl %edi, %eax -; AVX512VNNI-NEXT: vzeroupper -; AVX512VNNI-NEXT: retq -; -; AVX512VLVNNI-LABEL: mul_4xi8_cs: -; AVX512VLVNNI: # %bb.0: # %entry -; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 -; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax -; AVX512VLVNNI-NEXT: addl %edi, %eax -; AVX512VLVNNI-NEXT: retq +; ALL-LABEL: mul_4xi8_cs: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vpmovsxbd %xmm0, %xmm0 +; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; ALL-NEXT: vmovd %xmm0, %eax +; ALL-NEXT: addl %edi, %eax +; ALL-NEXT: retq entry: %0 = sext <4 x i8> %a to <4 x i32> %1 = mul nsw <4 x i32> , %0 @@ -151,8 +116,8 @@ ; ALL-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; ALL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; ALL-NEXT: vmovd %xmm0, %eax ; ALL-NEXT: addl %edi, %eax ; ALL-NEXT: retq Index: llvm/test/CodeGen/X86/sad.ll =================================================================== --- llvm/test/CodeGen/X86/sad.ll +++ llvm/test/CodeGen/X86/sad.ll @@ -994,20 +994,50 @@ ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; -; AVX-LABEL: sad_unroll_nonzero_initial: -; AVX: # %bb.0: # %bb -; AVX-NEXT: vmovdqu (%rdi), %xmm0 -; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqu (%rdx), %xmm1 -; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq +; AVX1-LABEL: sad_unroll_nonzero_initial: +; AVX1: # %bb.0: # %bb +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqu (%rdx), %xmm1 +; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: incl %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_unroll_nonzero_initial: +; AVX2: # %bb.0: # %bb +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqu (%rdx), %xmm1 +; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: retq +; +; AVX512-LABEL: sad_unroll_nonzero_initial: +; AVX512: # %bb.0: # %bb +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu (%rdx), %xmm1 +; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovd %xmm0, %eax +; AVX512-NEXT: retq bb: %tmp = load <16 x i8>, <16 x i8>* %arg, align 1 %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1 Index: llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll =================================================================== --- llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2069,14 +2069,12 @@ ; CHECK-SSE2-LABEL: test_srem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -2142,14 +2140,12 @@ ; CHECK-SSE2-LABEL: test_srem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] ; CHECK-SSE2-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] -; CHECK-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; CHECK-SSE2-NEXT: psrlq $32, %xmm0 -; CHECK-SSE2-NEXT: por %xmm2, %xmm0 +; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1 +; CHECK-SSE2-NEXT: psrlq $32, %xmm1 +; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; CHECK-SSE2-NEXT: por %xmm1, %xmm0 ; CHECK-SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 Index: llvm/test/CodeGen/X86/vector-reduce-mul.ll =================================================================== --- llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -101,17 +101,16 @@ ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: paddq %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3] +; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pmuludq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm3, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; @@ -125,16 +124,16 @@ ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -149,16 +148,16 @@ ; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -173,16 +172,16 @@ ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -197,16 +196,16 @@ ; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -238,46 +237,43 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: paddq %xmm5, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 +; SSE-NEXT: paddq %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrlq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm0, %xmm5 +; SSE-NEXT: paddq %xmm3, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: paddq %xmm0, %xmm5 +; SSE-NEXT: psrlq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm5 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm5, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: paddq %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: pmuludq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; @@ -292,32 +288,32 @@ ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm4 +; AVX1-NEXT: vpmuludq %xmm1, %xmm4, %xmm4 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 +; AVX1-NEXT: vpmuludq %xmm5, %xmm0, %xmm5 +; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1 +; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -331,25 +327,25 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm2, %xmm3 ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -364,25 +360,25 @@ ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlq $32, %xmm2, %xmm3 ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -397,25 +393,25 @@ ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm2, %xmm3 ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX512BWVL-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq @@ -450,86 +446,79 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm2, %xmm9 -; SSE-NEXT: paddq %xmm8, %xmm9 -; SSE-NEXT: psllq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pmuludq %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm8 +; SSE-NEXT: paddq %xmm9, %xmm8 +; SSE-NEXT: psllq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm6, %xmm2 +; SSE-NEXT: paddq %xmm2, %xmm8 +; SSE-NEXT: movdqa %xmm0, %xmm6 ; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm0, %xmm6 -; SSE-NEXT: paddq %xmm8, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm4, %xmm6 +; SSE-NEXT: movdqa %xmm4, %xmm9 +; SSE-NEXT: psrlq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm0, %xmm9 +; SSE-NEXT: paddq %xmm6, %xmm9 +; SSE-NEXT: psllq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: paddq %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm6 ; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm3, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: pmuludq %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm4 ; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pmuludq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm6, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm7, %xmm3 +; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm6 ; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm1, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm7 +; SSE-NEXT: psrlq $32, %xmm7 +; SSE-NEXT: pmuludq %xmm1, %xmm7 +; SSE-NEXT: paddq %xmm6, %xmm7 +; SSE-NEXT: psllq $32, %xmm7 ; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 +; SSE-NEXT: paddq %xmm1, %xmm7 +; SSE-NEXT: psrlq $32, %xmm7 +; SSE-NEXT: pmuludq %xmm3, %xmm7 ; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: paddq %xmm7, %xmm4 +; SSE-NEXT: psllq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: paddq %xmm1, %xmm4 +; SSE-NEXT: psrlq $32, %xmm9 +; SSE-NEXT: pmuludq %xmm2, %xmm9 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm0, %xmm8 +; SSE-NEXT: paddq %xmm9, %xmm8 +; SSE-NEXT: psllq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: paddq %xmm0, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pmuludq %xmm1, %xmm8 ; SSE-NEXT: psrlq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: paddq %xmm8, %xmm4 ; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: paddq %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] +; SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: pmuludq %xmm1, %xmm4 +; SSE-NEXT: pmuludq %xmm0, %xmm2 +; SSE-NEXT: paddq %xmm4, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 ; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: retq ; @@ -540,70 +529,70 @@ ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 ; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 ; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 +; AVX1-NEXT: vpsllq $32, %xmm4, %xmm5 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm8 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm6 +; AVX1-NEXT: vpmuludq %xmm2, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm7 +; AVX1-NEXT: vpmuludq %xmm7, %xmm0, %xmm7 +; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 +; AVX1-NEXT: vpsllq $32, %xmm6, %xmm7 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpaddq %xmm7, %xmm6, %xmm9 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 -; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5 +; AVX1-NEXT: vpmuludq %xmm3, %xmm5, %xmm5 ; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7 ; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 +; AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 +; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm7 +; AVX1-NEXT: vpmuludq %xmm7, %xmm0, %xmm7 +; AVX1-NEXT: vpaddq %xmm5, %xmm7, %xmm5 +; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm2 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm8, %xmm3 +; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm3 ; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 +; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -617,41 +606,41 @@ ; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm4 +; AVX2-NEXT: vpmuludq %ymm2, %ymm4, %ymm4 +; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm5 +; AVX2-NEXT: vpmuludq %ymm5, %ymm0, %ymm5 +; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 +; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm3 ; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 ; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm1 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm2, %xmm3 ; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -665,34 +654,34 @@ ; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 ; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm3 ; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vpaddq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpsllq $32, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrlq $32, %xmm2, %xmm3 ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovq %xmm0, %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -706,34 +695,34 @@ ; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 ; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 +; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm1 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm1 +; AVX512BWVL-NEXT: vpmuludq %zmm2, %zmm1, %zmm1 +; AVX512BWVL-NEXT: vpsrlq $32, %zmm2, %zmm3 ; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512BWVL-NEXT: vpaddq %zmm1, %zmm3, %zmm1 +; AVX512BWVL-NEXT: vpsllq $32, %zmm1, %zmm1 +; AVX512BWVL-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 +; AVX512BWVL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsrlq $32, %xmm2, %xmm3 ; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm3, %xmm1 +; AVX512BWVL-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 +; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 +; AVX512BWVL-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vmovq %xmm0, %rax ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq