Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48952,6 +48952,73 @@ return DAG.getBitcast(VT, CFmul); } +/// This inverts a canonicalization in IR that replaces a variable select arm +/// with an identity constant. Codegen improves if we re-use the variable +/// operand rather than load a constant. This can also be converted into a +/// masked vector operation if the target supports it. +static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG, + bool ShouldCommuteOperands) { + // Match a select as operand 1. The identity constant that we are looking for + // is only valid as operand 1 of a non-commutative binop. + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + if (ShouldCommuteOperands) + std::swap(N0, N1); + + // TODO: Should this apply to scalar select too? + if (!N1.hasOneUse() || N1.getOpcode() != ISD::VSELECT) + return SDValue(); + + unsigned Opcode = N->getOpcode(); + EVT VT = N->getValueType(0); + SDValue Cond = N1.getOperand(0); + SDValue TVal = N1.getOperand(1); + SDValue FVal = N1.getOperand(2); + + // TODO: This (and possibly the entire function) belongs in a + // target-independent location. The cases should also match the IR + // function ConstantExpr::getBinOpIdentity(). + auto isIdentityConstantForOpcode = [](unsigned Opcode, SDValue V) { + if (ConstantFPSDNode *C = isConstOrConstSplatFP(V)) { + switch (Opcode) { + case ISD::FADD: // X + -0.0 --> X + return C->isZero() && C->isNegative(); + case ISD::FSUB: // X - 0.0 --> X + return C->isZero() && !C->isNegative(); + } + } + return false; + }; + + // This transform increases uses of N0, so freeze it to be safe. 
+ // binop N0, (vselect Cond, IDC, FVal) --> vselect Cond, N0, (binop N0, FVal) + if (isIdentityConstantForOpcode(Opcode, TVal)) { + SDValue F0 = DAG.getFreeze(N0); + SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, FVal, N->getFlags()); + return DAG.getSelect(SDLoc(N), VT, Cond, F0, NewBO); + } + // binop N0, (vselect Cond, TVal, IDC) --> vselect Cond, (binop N0, TVal), N0 + if (isIdentityConstantForOpcode(Opcode, FVal)) { + SDValue F0 = DAG.getFreeze(N0); + SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, TVal, N->getFlags()); + return DAG.getSelect(SDLoc(N), VT, Cond, NewBO, F0); + } + + return SDValue(); +} + +static SDValue combineBinopWithSelect(SDNode *N, SelectionDAG &DAG) { + if (SDValue Sel = foldSelectWithIdentityConstant(N, DAG, false)) + return Sel; + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isCommutativeBinOp(N->getOpcode())) + if (SDValue Sel = foldSelectWithIdentityConstant(N, DAG, true)) + return Sel; + + return SDValue(); +} + /// Do target-specific dag combines on floating-point adds/subs. 
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -48961,6 +49028,9 @@ if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget)) return COp; + if (SDValue Sel = combineBinopWithSelect(N, DAG)) + return Sel; + return SDValue(); } Index: llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll =================================================================== --- llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll +++ llvm/test/CodeGen/X86/avx512fp16-arith-intrinsics.ll @@ -83,8 +83,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 ; CHECK-NEXT: vsubph %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vsubph (%rsi), %zmm1, %zmm1 {%k1} {z} -; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vsubph (%rsi), %zmm1, %zmm1 +; CHECK-NEXT: vsubph %zmm1, %zmm0, %zmm0 {%k1} ; CHECK-NEXT: retq %mask = bitcast i32 %msk to <32 x i1> %val = load <32 x half>, <32 x half>* %ptr Index: llvm/test/CodeGen/X86/vector-bo-select.ll =================================================================== --- llvm/test/CodeGen/X86/vector-bo-select.ll +++ llvm/test/CodeGen/X86/vector-bo-select.ll @@ -7,19 +7,18 @@ ; AVX2-LABEL: fadd_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vaddps %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fadd_v4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vaddps %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vblendmps 
%zmm0, %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -27,9 +26,8 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vbroadcastss {{.*#+}} xmm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512VL-NEXT: vmovaps %xmm2, %xmm0 {%k1} -; AVX512VL-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vaddps %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 ; AVX512VL-NEXT: retq %s = select <4 x i1> %b, <4 x float> %y, <4 x float> %r = fadd <4 x float> %x, %s @@ -41,20 +39,19 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fadd_v8f32_commute: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} -; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vaddps %ymm2, %ymm1, %ymm0 +; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fadd_v8f32_commute: @@ -62,9 +59,8 @@ ; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 ; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmd 
%ymm0, %ymm0, %k1 -; AVX512VL-NEXT: vbroadcastss {{.*#+}} ymm0 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512VL-NEXT: vmovaps %ymm2, %ymm0 {%k1} -; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vaddps %ymm2, %ymm1, %ymm1 {%k1} +; AVX512VL-NEXT: vmovaps %ymm1, %ymm0 ; AVX512VL-NEXT: retq %s = select <8 x i1> %b, <8 x float> %y, <8 x float> %r = fadd <8 x float> %s, %x @@ -74,17 +70,17 @@ define <16 x float> @fadd_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; AVX2-LABEL: fadd_v16f32_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vblendvps %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vaddps %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vaddps %ymm3, %ymm1, %ymm0 -; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm1 +; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: fadd_v16f32_swap: @@ -92,8 
+88,8 @@ ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vbroadcastss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2 {%k1} ; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm0 +; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x float> , <16 x float> %y %r = fadd <16 x float> %x, %s @@ -103,17 +99,17 @@ define <16 x float> @fadd_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; AVX2-LABEL: fadd_v16f32_commute_swap: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX2-NEXT: vblendvps %ymm5, %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vblendvps %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vaddps %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vblendvps %ymm0, %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vaddps %ymm1, %ymm3, %ymm0 -; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm1 +; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: fadd_v16f32_commute_swap: @@ -121,8 
+117,8 @@ ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vbroadcastss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2 {%k1} -; AVX512-NEXT: vaddps %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: vaddps %zmm2, %zmm1, %zmm0 +; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x float> , <16 x float> %y %r = fadd <16 x float> %s, %x @@ -133,18 +129,18 @@ ; AVX2-LABEL: fsub_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vsubps %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fsub_v4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 +; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vmovaps %zmm2, %zmm0 {%k1} {z} -; AVX512F-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: vsubps %xmm2, %xmm1, %xmm0 +; AVX512F-NEXT: vblendmps %zmm0, %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; @@ -152,14 +148,16 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmovaps %xmm2, %xmm0 {%k1} {z} -; AVX512VL-NEXT: vsubps %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vsubps %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-NEXT: vmovaps %xmm1, %xmm0 ; AVX512VL-NEXT: retq %s = select <4 x i1> %b, <4 x float> %y, <4 x float> zeroinitializer %r = fsub <4 x float> %x, %s ret <4 x float> %r } +; negative test - fsub is not commutative; there is no identity constant for operand 0 + define <8 x float> @fsub_v8f32_commute(<8 x i1> %b, <8 x float> noundef %x, <8 x float> noundef %y) { ; AVX2-LABEL: 
fsub_v8f32_commute: ; AVX2: # %bb.0: @@ -196,33 +194,34 @@ define <16 x float> @fsub_v16f32_swap(<16 x i1> %b, <16 x float> noundef %x, <16 x float> noundef %y) { ; AVX2-LABEL: fsub_v16f32_swap: ; AVX2: # %bb.0: +; AVX2-NEXT: vsubps %ymm4, %ymm2, %ymm4 ; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero ; AVX2-NEXT: vpslld $31, %ymm5, %ymm5 -; AVX2-NEXT: vpsrad $31, %ymm5, %ymm5 -; AVX2-NEXT: vpandn %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vblendvps %ymm5, %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vsubps %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 -; AVX2-NEXT: vpandn %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vsubps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vsubps %ymm4, %ymm2, %ymm1 +; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0 +; AVX2-NEXT: vmovaps %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: fsub_v16f32_swap: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vmovaps %zmm2, %zmm0 {%k1} {z} -; AVX512-NEXT: vsubps %zmm0, %zmm1, %zmm0 +; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vsubps %zmm2, %zmm1, %zmm0 +; AVX512-NEXT: vmovaps %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x float> zeroinitializer, <16 x float> %y %r = fsub <16 x float> %x, %s ret <16 x float> %r } +; negative test - fsub is not commutative; there is no identity constant for operand 0 + define <16 x float> @fsub_v16f32_commute_swap(<16 x i1> %b, <16 x float> noundef 
%x, <16 x float> noundef %y) { ; AVX2-LABEL: fsub_v16f32_commute_swap: ; AVX2: # %bb.0: @@ -301,26 +300,23 @@ ; AVX2-NEXT: negl %eax ; AVX2-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fadd_v8f32_cast_cond: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512F-NEXT: vmovaps %zmm1, %zmm2 {%k1} -; AVX512F-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: vaddps %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vblendmps %zmm1, %zmm0, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fadd_v8f32_cast_cond: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX512VL-NEXT: vmovaps %ymm1, %ymm2 {%k1} -; AVX512VL-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vaddps %ymm1, %ymm0, %ymm0 {%k1} ; AVX512VL-NEXT: retq %b = bitcast i8 %pb to <8 x i1> %s = select <8 x i1> %b, <8 x float> %y, <8 x float> @@ -376,23 +372,23 @@ ; AVX2-NEXT: negl %eax ; AVX2-NEXT: vpinsrd $3, %eax, %xmm3, %xmm3 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fsub_v8f32_cast_cond: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: 
def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vmovaps %zmm1, %zmm1 {%k1} {z} -; AVX512F-NEXT: vsubps %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vsubps %ymm1, %ymm0, %ymm1 +; AVX512F-NEXT: vblendmps %zmm1, %zmm0, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: fsub_v8f32_cast_cond: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vmovaps %ymm1, %ymm1 {%k1} {z} -; AVX512VL-NEXT: vsubps %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vsubps %ymm1, %ymm0, %ymm0 {%k1} ; AVX512VL-NEXT: retq %b = bitcast i8 %pb to <8 x i1> %s = select <8 x i1> %b, <8 x float> %y, <8 x float> zeroinitializer