Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -32194,6 +32194,27 @@
     return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   }
 
+  // Some mask scalar intrinsics rely on checking if only one bit is set
+  // and implement it in C code like this:
+  // A[0] = (U & 1) ? A[0] : W[0];
+  // This creates some redundant instructions that break pattern matching.
+  // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
+  if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
+      Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    SDValue AndNode = Cond.getOperand(0);
+    if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
+        isNullConstant(Cond.getOperand(1)) &&
+        isa<ConstantSDNode>(AndNode.getOperand(1)) &&
+        cast<ConstantSDNode>(AndNode.getOperand(1))->getAPIntValue() == 1) {
+      // LHS and RHS swapped due to
+      // setcc outputting 1 when AND resulted in 0 and vice versa.
+      if (AndNode.getValueType() != MVT::i8)
+        AndNode = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AndNode);
+      return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
+    }
+  }
+
   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
   // lowering on KNL. In this case we convert it to
   // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
Index: lib/Target/X86/X86InstrAVX512.td
===================================================================
--- lib/Target/X86/X86InstrAVX512.td
+++ lib/Target/X86/X86InstrAVX512.td
@@ -6633,6 +6633,63 @@
 defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
                             X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
 
+multiclass avx512_scalar_fma_patterns {
+  let Predicates = [HasFMA, HasAVX512] in {
+    def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector
+                (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+                    (EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
+                    (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))),
+              (!cast<Instruction>(Prefix#"213"#Suffix#"Zr_Int")
+               VR128:$src2, VR128:$src1, VR128:$src3)>;
+
+    def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                 (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+                     (EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
+                     (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))),
+                 (EltVT (extractelt (VT VR128:$src2), (iPTR 0)))))))),
+              (!cast<Instruction>(Prefix#"213"#Suffix#"Zr_Intk")
+               VR128:$src2, VK1WM:$mask, VR128:$src1, VR128:$src3)>;
+
+    def : Pat<(VT (Move (VT VR128:$src3), (VT (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                 (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+                     (EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
+                     (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))),
+                 (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))),
+              (!cast<Instruction>(Prefix#"231"#Suffix#"Zr_Intk")
+               VR128:$src3, VK1WM:$mask, VR128:$src2, VR128:$src1)>;
+
+    def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                 (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
+                     (EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
+                     (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))),
+                 (EltVT ZeroFP)))))),
+              (!cast<Instruction>(Prefix#"213"#Suffix#"Zr_Intkz")
+               VR128:$src2, VK1WM:$mask, VR128:$src1, VR128:$src3)>;
+  }
+}
+
+defm : avx512_scalar_fma_patterns;
+defm : avx512_scalar_fma_patterns;
+defm : avx512_scalar_fma_patterns;
+defm : avx512_scalar_fma_patterns;
+
+defm : avx512_scalar_fma_patterns;
+defm :
avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; + //===----------------------------------------------------------------------===// // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA //===----------------------------------------------------------------------===// @@ -8435,6 +8492,42 @@ VEX_W, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; +multiclass avx512_masked_scalar { + let Predicates = [BasePredicate] in { + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + (extractelt _.VT:$dst, (iPTR 0))))), + (!cast("V"#OpcPrefix#r_Intk) + _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>; + + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + ZeroFP))), + (!cast("V"#OpcPrefix#r_Intkz) + OutMask, _.VT:$src2, _.VT:$src1)>; + } +} + +multiclass avx512_masked_scalar_imm ImmV, dag OutMask, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + (extractelt _.VT:$dst, (iPTR 0))))), + (!cast("V"#OpcPrefix#r_Intk) + _.VT:$dst, OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; + + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))), + (!cast("V"#OpcPrefix#r_Intkz) + OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; + } +} + //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- @@ -10783,69 +10876,54 @@ // TODO: Some canonicalization in lowering would simplify the number of // patterns we have to try to match. 
-multiclass AVX512_scalar_math_f32_patterns { +multiclass AVX512_scalar_math_fp_patterns { let Predicates = [HasAVX512] in { // extracted scalar math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))), - FR32X:$src))))), - (!cast("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + def : Pat<(_.VT (MoveNode (_.VT VR128X:$dst), (_.VT (scalar_to_vector + (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))), + _.FRC:$src))))), + (!cast("V"#OpcPrefix#Zrr_Int) _.VT:$dst, + (COPY_TO_REGCLASS _.FRC:$src, VR128X))>; // vector math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))), - (!cast("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>; + def : Pat<(_.VT (MoveNode (_.VT VR128X:$dst), + (Op (_.VT VR128X:$dst), (_.VT VR128X:$src)))), + (!cast("V"#OpcPrefix#Zrr_Int) _.VT:$dst, _.VT:$src)>; // extracted masked scalar math op with insert via movss - def : Pat<(X86Movss (v4f32 VR128X:$src1), + def : Pat<(MoveNode (_.VT VR128X:$src1), (scalar_to_vector (X86selects VK1WM:$mask, - (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))), - FR32X:$src2), - FR32X:$src0))), - (!cast("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X), - VK1WM:$mask, v4f32:$src1, - (COPY_TO_REGCLASS FR32X:$src2, VR128X))>; - } -} - -defm : AVX512_scalar_math_f32_patterns; -defm : AVX512_scalar_math_f32_patterns; -defm : AVX512_scalar_math_f32_patterns; -defm : AVX512_scalar_math_f32_patterns; - -multiclass AVX512_scalar_math_f64_patterns { - let Predicates = [HasAVX512] in { - // extracted scalar math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))), - FR64X:$src))))), - (!cast("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64X:$src, VR128X))>; - - // vector math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))), - (!cast("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>; - + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), + _.FRC:$src0))), + (!cast("V"#OpcPrefix#Zrr_Intk) (COPY_TO_REGCLASS _.FRC:$src0, VR128X), + VK1WM:$mask, _.VT:$src1, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X))>; + // extracted masked scalar math op with insert via movss - def : Pat<(X86Movsd (v2f64 VR128X:$src1), + def : Pat<(MoveNode (_.VT VR128X:$src1), (scalar_to_vector (X86selects VK1WM:$mask, - (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))), - FR64X:$src2), - FR64X:$src0))), - (!cast("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X), - VK1WM:$mask, v2f64:$src1, - (COPY_TO_REGCLASS FR64X:$src2, VR128X))>; + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), (_.EltVT ZeroFP)))), + (!cast("V"#OpcPrefix#Zrr_Intkz) + VK1WM:$mask, _.VT:$src1, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X))>; } } -defm : AVX512_scalar_math_f64_patterns; -defm : AVX512_scalar_math_f64_patterns; -defm : AVX512_scalar_math_f64_patterns; -defm : AVX512_scalar_math_f64_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; + +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; + 
//===----------------------------------------------------------------------===// // AES instructions Index: lib/Target/X86/X86InstrFMA.td =================================================================== --- lib/Target/X86/X86InstrFMA.td +++ lib/Target/X86/X86InstrFMA.td @@ -364,6 +364,28 @@ defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub, SchedWriteFMA.Scl>, VEX_LIG; +multiclass scalar_fma_patterns { + let Predicates = [HasFMA, NoAVX512] in { + def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector + (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src2), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))), + (!cast(Prefix#"213"#Suffix#"r_Int") + VR128:$src2, VR128:$src1, VR128:$src3)>; + } +} + +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; + +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; + //===----------------------------------------------------------------------===// // FMA4 - AMD 4 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// Index: lib/Target/X86/X86InstrSSE.td =================================================================== --- lib/Target/X86/X86InstrSSE.td +++ lib/Target/X86/X86InstrSSE.td @@ -2679,78 +2679,49 @@ // TODO: Some canonicalization in lowering would simplify the number of // patterns we have to try to match. -multiclass scalar_math_f32_patterns { - let Predicates = [UseSSE1] in { - // extracted scalar math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (!cast(OpcPrefix#SSrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32:$src, VR128))>; - - // vector math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (!cast(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; - } - - // Repeat everything for AVX. - let Predicates = [UseAVX] in { - // extracted scalar math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (!cast("V"#OpcPrefix#SSrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32:$src, VR128))>; - - // vector math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (!cast("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; - } -} - -defm : scalar_math_f32_patterns; -defm : scalar_math_f32_patterns; -defm : scalar_math_f32_patterns; -defm : scalar_math_f32_patterns; - -multiclass scalar_math_f64_patterns { - let Predicates = [UseSSE2] in { - // extracted scalar math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (!cast(OpcPrefix#SDrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64:$src, VR128))>; - - // vector math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (!cast(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; - } - - // Repeat everything for AVX. 
- let Predicates = [UseAVX] in { - // extracted scalar math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (!cast("V"#OpcPrefix#SDrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64:$src, VR128))>; - - // vector math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (!cast("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; - } -} - -defm : scalar_math_f64_patterns; -defm : scalar_math_f64_patterns; -defm : scalar_math_f64_patterns; -defm : scalar_math_f64_patterns; - +multiclass scalar_math_patterns { + let Predicates = [BasePredicate] in { + // extracted scalar math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), (VT (scalar_to_vector + (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), + RC:$src))))), + (!cast(OpcPrefix#rr_Int) VT:$dst, + (COPY_TO_REGCLASS RC:$src, VR128))>; + + // vector math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), + (Op (VT VR128:$dst), (VT VR128:$src)))), + (!cast(OpcPrefix#rr_Int) VT:$dst, VT:$src)>; + } + + // Repeat for AVX versions of the instructions. + let Predicates = [UseAVX] in { + // extracted scalar math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), (VT (scalar_to_vector + (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), + RC:$src))))), + (!cast("V"#OpcPrefix#rr_Int) VT:$dst, + (COPY_TO_REGCLASS RC:$src, VR128))>; + + // vector math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), + (Op (VT VR128:$dst), (VT VR128:$src)))), + (!cast("V"#OpcPrefix#rr_Int) VT:$dst, VT:$src)>; + } +} + +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; + +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; + /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the @@ -2979,13 +2950,42 @@ // There is no f64 version of the reciprocal approximation instructions. -// TODO: We should add *scalar* op patterns for these just like we have for -// the binops above. If the binop and unop patterns could all be unified -// that would be even better. +multiclass scalar_unary_math_patterns { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } + + // Repeat for AVX versions of the instructions. + let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } +} + +multiclass scalar_unary_math_imm_patterns ImmV, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; + } + + // Repeat for AVX versions of the instructions. 
+ let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; + } +} -multiclass scalar_unary_math_patterns { +multiclass scalar_unary_math_intr_patterns { let Predicates = [BasePredicate] in { def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), (!cast(OpcPrefix#r_Int) VT:$dst, VT:$src)>; @@ -2998,14 +2998,14 @@ } } -defm : scalar_unary_math_patterns; -defm : scalar_unary_math_patterns; -defm : scalar_unary_math_patterns; -defm : scalar_unary_math_patterns; +defm : scalar_unary_math_intr_patterns; +defm : scalar_unary_math_intr_patterns; +defm : scalar_unary_math_intr_patterns; +defm : scalar_unary_math_intr_patterns; //===----------------------------------------------------------------------===// Index: test/CodeGen/X86/combine-select.ll =================================================================== --- test/CodeGen/X86/combine-select.ll +++ test/CodeGen/X86/combine-select.ll @@ -4,12 +4,8 @@ define <4 x float> @select_mask_add_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_mask_add_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vaddss %xmm2, %xmm1, %xmm2 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -26,13 +22,8 @@ define <4 x float> @select_maskz_add_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_maskz_add_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -48,12 +39,8 @@ define <4 x float> @select_mask_sub_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_mask_sub_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm2 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -70,13 +57,8 @@ define <4 x float> @select_maskz_sub_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_maskz_sub_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -92,12 +74,8 @@ define <4 x float> @select_mask_mul_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_mask_mul_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: 
kmovw %eax, %k1 -; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -114,13 +92,8 @@ define <4 x float> @select_maskz_mul_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_maskz_mul_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 Index: test/CodeGen/X86/fma-scalar-combine.ll =================================================================== --- test/CodeGen/X86/fma-scalar-combine.ll +++ test/CodeGen/X86/fma-scalar-combine.ll @@ -4,10 +4,9 @@ define <2 x double> @combine_scalar_mask_fmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmadd_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -29,10 +28,9 @@ define <2 x double> @combine_scalar_mask_fmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmadd_f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -50,10 +48,9 @@ define <2 x double> @combine_scalar_maskz_fmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -75,10 +72,9 @@ define <2 x double> @combine_scalar_maskz_fmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX 
TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -96,10 +92,9 @@ define <2 x double> @combine_scalar_mask3_fmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -114,7 +109,7 @@ %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 to <2 x double> ret <2 x double> %12 } @@ -122,10 +117,9 @@ define <2 x double> @combine_scalar_mask3_fmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -137,17 +131,16 @@ %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 } define <2 x double> @combine_scalar_mask_fmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmsub_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -169,10 +162,9 @@ define <2 x double> @combine_scalar_mask_fmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmsub_f64: ; CHECK: 
# %bb.0: # %entry -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -190,10 +182,9 @@ define <2 x double> @combine_scalar_maskz_fmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -215,10 +206,9 @@ define <2 x double> @combine_scalar_maskz_fmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -236,10 +226,9 @@ define <2 x double> @combine_scalar_mask3_fmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -254,7 +243,7 @@ %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 to <2 x double> ret <2 x double> %12 } @@ -262,10 +251,9 @@ define <2 x double> @combine_scalar_mask3_fmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: 
kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -277,17 +265,16 @@ %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 } define <2 x double> @combine_scalar_mask_fnmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmadd_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -309,10 +296,9 @@ define <2 x double> @combine_scalar_mask_fnmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmadd_f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -330,10 +316,9 @@ define <2 x double> @combine_scalar_maskz_fnmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -355,10 +340,9 @@ define <2 x double> @combine_scalar_maskz_fnmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: 
[0x62,0xf2,0xf5,0x89,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -376,10 +360,9 @@ define <2 x double> @combine_scalar_mask3_fnmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbd,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -394,7 +377,7 @@ %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 to <2 x double> ret <2 x double> %12 } @@ -402,10 +385,9 @@ define <2 x double> @combine_scalar_mask3_fnmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmadd231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbd,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -417,17 +399,16 @@ %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 } define <2 x double> @combine_scalar_mask_fnmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmsub_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -450,10 +431,9 @@ define <2 x double> @combine_scalar_mask_fnmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmsub_f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: 
[0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -472,10 +452,9 @@ define <2 x double> @combine_scalar_maskz_fnmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -498,10 +477,9 @@ define <2 x double> @combine_scalar_maskz_fnmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -520,10 +498,9 @@ define <2 x double> @combine_scalar_mask3_fnmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -539,7 +516,7 @@ %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 to <2 x double> ret <2 x double> %12 } @@ -547,10 +524,9 @@ define <2 x double> @combine_scalar_mask3_fnmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * 
xmm1) - xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -563,6 +539,6 @@ %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 }
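
For reference, a minimal C sketch (not part of this patch; the function and variable names are invented for illustration) of the header-style idiom that the new combine in X86ISelLowering.cpp targets. Writing the lane-0 blend as (U & 1) ? result : passthrough produces the and/setcc-against-zero/select chain that the combine folds into a plain select over (and U, 1), truncated to i8 so it can feed the mask directly, with the select operands swapped to undo the seteq inversion:

  // Hypothetical illustration of the idiom named in the combine's comment:
  //   A[0] = (U & 1) ? A[0] : W[0];
  // Lane 0 takes the freshly computed sum when bit 0 of the mask U is set
  // and keeps the passthrough value W[0] otherwise; lanes 1..3 stay as in A.
  void mask_add_ss_sketch(float A[4], const float B[4], const float W[4],
                          unsigned char U) {
    float Res = A[0] + B[0];      // scalar op on lane 0 only
    A[0] = (U & 1) ? Res : W[0];  // masked merge of lane 0
  }

With the combine plus the new X86selects patterns, the equivalent vector-typed IR (select_mask_add_ss in test/CodeGen/X86/combine-select.ll) now compiles to kmovw %edi, %k1 followed by a single vaddss %xmm2, %xmm1, %xmm0 {%k1}, instead of the vaddss/testb/sete/kmovw/vmovss/vmovaps sequence shown in the removed CHECK lines.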
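
The masked scalar FMA patterns added to X86InstrAVX512.td cover the same idiom for multiply-add. A hedged C sketch of roughly the shape exercised by combine_scalar_mask_fmadd_f32 in test/CodeGen/X86/fma-scalar-combine.ll (again, the names are invented, and plain C multiply and add are used only to show the masking shape; the real intrinsics imply a fused operation):

  // Lane 0 becomes a[0] * b[0] + c[0] when bit 0 of the mask k is set,
  // and keeps a's original lane 0 otherwise.
  void mask_fmadd_ss_sketch(float a[4], const float b[4], const float c[4],
                            unsigned char k) {
    float r = a[0] * b[0] + c[0];
    a[0] = (k & 1) ? r : a[0];
  }

Which form is selected depends on the operand the result merges back into: merging into a multiplicand gives vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}, while merging into the addend (the mask3 tests, whose IR now inserts into %c rather than %a) gives vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1}, as the updated CHECK lines show.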