Index: llvm/lib/Target/X86/X86InstrFMA.td =================================================================== --- llvm/lib/Target/X86/X86InstrFMA.td +++ llvm/lib/Target/X86/X86InstrFMA.td @@ -126,9 +126,22 @@ v4f64>, VEX_W; } -let Constraints = "$src1 = $dst" in { -multiclass fma3s_rm opc, string OpcodeStr, X86MemOperand x86memop, - RegisterClass RC, ValueType OpVT, PatFrag mem_frag, +// All source register operands of FMA instructions can be commuted. +// In many cases such commute transformation requires an opcode adjustment, +// for example, commuting the operands 1 and 2 in FMA*132 form would require +// an opcode change to FMA*231: +// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2; +// --> +// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; +// Currently, the commute transformation is supported for only a few FMA forms. +// That is the reason why \p IsRVariantCommutable and \p IsMVariantCommutable +// parameters are used here. +// The general commute operands optimization working for all forms is going +// to be implemented soon. (Please see http://reviews.llvm.org/D13269 +// for details). 
+let Constraints = "$src1 = $dst", hasSideEffects = 0 in { +multiclass fma3s_rm opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0, SDPatternOperator OpNode = null_frag> { let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in @@ -136,8 +149,7 @@ (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>; + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; let mayLoad = 1, isCommutable = IsMVariantCommutable in def m : FMA3; + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; } -} // Constraints = "$src1 = $dst" +} // Constraints = "$src1 = $dst", hasSideEffects = 0 + +// These FMA*_Int instructions are defined specially for being used when +// the scalar FMA intrinsics are lowered to machine instructions, and in that +// sense they are similar to existing ADD*_Int, SUB*_Int, MUL*_Int, etc. +// instructions. +// +// The FMA*_Int instructions are _TEMPORARILY_ defined as NOT commutable. +// The upper bits of the result of scalar FMA intrinsics must be copied from +// the upper bits of the 1st operand. So, commuting the 1st operand would +// invalidate the upper bits of the intrinsic result. +// The corresponding optimization which allows commuting 2nd and 3rd operands +// of FMA*_Int instructions has been developed and is waiting for +// code-review approval and checkin (Please see http://reviews.llvm.org/D13269). 
+let Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1, + hasSideEffects = 0 in { +multiclass fma3s_rm_int opc, string OpcodeStr, + Operand memopr, RegisterClass RC> { + def r_Int : FMA3; + + let mayLoad = 1 in + def m_Int : FMA3; +} +} // Constraints = "$src1 = $dst", isCommutable = 0, isCodeGenOnly =1, + // hasSideEffects = 0 multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, string PT2, Intrinsic Int, - SDNode OpNode, RegisterClass RC, ValueType OpVT, - X86MemOperand x86memop, Operand memop, PatFrag mem_frag, - ComplexPattern mem_cpat> { -let hasSideEffects = 0 in { - defm r132 : fma3s_rm; - // See the other defm of r231 for the explanation regarding the - // commutable flags. - defm r231 : fma3s_rm { + defm r132 : fma3s_rm; + defm r213 : fma3s_rm; + /* IsMVariantCommutable */ 1, + OpNode>; + defm r231 : fma3s_rm; } -// See the other defm of r213 for the explanation regarding the -// commutable flags. -defm r213 : fma3s_rm; +// The FMA 213 form is created for lowering of scalar FMA intrinsics +// to machine instructions. +// The FMA 132 form can trivially be obtained by commuting the 2nd and 3rd +// operands of the FMA 213 form. +// The FMA 231 form can be obtained only by commuting the 1st operand of the +// 213 or 132 forms, and that is possible only after special analysis of all +// uses of the initial instruction. Such analysis does not exist yet, and thus +// the 231 form of FMA*_Int instructions is introduced on the optimistic +// assumption that such analysis will be implemented eventually. 
+multiclass fma3s_int_forms opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, + RegisterClass RC, Operand memop> { + defm r132 : fma3s_rm_int; + defm r213 : fma3s_rm_int; + defm r231 : fma3s_rm_int; } multiclass fma3s opc132, bits<8> opc213, bits<8> opc231, string OpStr, Intrinsic IntF32, Intrinsic IntF64, SDNode OpNode> { - defm SS : fma3s_forms; - defm SD : fma3s_forms, VEX_W; - -// These patterns use the 123 ordering, instead of 213, even though -// they match the intrinsic to the 213 version of the instruction. -// This is because src1 is tied to dest, and the scalar intrinsics -// require the pass-through values to come from the first source -// operand, not the second. + defm SS : fma3s_forms, + fma3s_int_forms; + defm SD : fma3s_forms, + fma3s_int_forms, + VEX_W; + + // These patterns use the 123 ordering, instead of 213, even though + // they match the intrinsic to the 213 version of the instruction. + // This is because src1 is tied to dest, and the scalar intrinsics + // require the pass-through values to come from the first source + // operand, not the second. 
def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), (COPY_TO_REGCLASS - (!cast(NAME#"SSr213r") + (!cast(NAME#"SSr213r_Int") (COPY_TO_REGCLASS $src1, FR32), (COPY_TO_REGCLASS $src2, FR32), (COPY_TO_REGCLASS $src3, FR32)), @@ -198,7 +254,7 @@ def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), (COPY_TO_REGCLASS - (!cast(NAME#"SDr213r") + (!cast(NAME#"SDr213r_Int") (COPY_TO_REGCLASS $src1, FR64), (COPY_TO_REGCLASS $src2, FR64), (COPY_TO_REGCLASS $src3, FR64)), Index: llvm/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86InstrInfo.cpp +++ llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1734,11 +1734,17 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFMADDSSr231r_Int, X86::VFMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFMADDSDr231r_Int, X86::VFMADDSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, @@ -1754,11 +1760,17 @@ { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr231r_Int, X86::VFNMADDSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr231r_Int, X86::VFNMADDSDr231m_Int, 
TB_ALIGN_NONE }, { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, TB_ALIGN_NONE }, + { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, @@ -1774,11 +1786,17 @@ { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr231r_Int, X86::VFMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr231r_Int, X86::VFMSUBSDr231m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, @@ -1794,11 +1812,17 @@ { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr231r_Int, X86::VFNMSUBSSr231m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr231r_Int, X86::VFNMSUBSDr231m_Int, TB_ALIGN_NONE }, { 
X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, + { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, Index: llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll =================================================================== --- llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll +++ llvm/test/CodeGen/X86/fma-intrinsics-phi-213-to-231.ll @@ -1,8 +1,337 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s -; CHECK-LABEL: fmaddsubpd_loop: -; CHECK: vfmaddsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <4 x double> @fmaddsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +; CHECK-LABEL: fmaddsubpd_loop_128: +; CHECK: vfmaddsub231pd %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <2 x double> @fmaddsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <2 x double> %c.addr.0 +} + +; CHECK-LABEL: fmsubaddpd_loop_128: +; CHECK: vfmsubadd231pd %xmm1, 
%xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <2 x double> @fmsubaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <2 x double> %c.addr.0 +} + +; CHECK-LABEL: fmaddpd_loop_128: +; CHECK: vfmadd231pd %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <2 x double> @fmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <2 x double> %c.addr.0 +} + +; CHECK-LABEL: fmsubpd_loop_128: +; CHECK: vfmsub231pd %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <2 x double> @fmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0) + %inc 
= add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <2 x double> %c.addr.0 +} + +; CHECK-LABEL: fnmaddpd_loop_128: +; CHECK: vfnmadd231pd %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <2 x double> @fnmaddpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <2 x double> %c.addr.0 +} + +; CHECK-LABEL: fnmsubpd_loop_128: +; CHECK: vfnmsub231pd %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <2 x double> @fnmsubpd_loop_128(i32 %iter, <2 x double> %a, <2 x double> %b, <2 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <2 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a, <2 x double> %b, <2 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <2 x double> %c.addr.0 +} + +declare <2 x double> @llvm.x86.fma.vfmaddsub.pd(<2 x double>, <2 x double>, <2 x double>) +declare <2 x double> @llvm.x86.fma.vfmsubadd.pd(<2 x double>, <2 x double>, <2 x double>) +declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) +declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) +declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x double>) +declare <2 x double> 
@llvm.x86.fma.vfnmsub.pd(<2 x double>, <2 x double>, <2 x double>) + + +; CHECK-LABEL: fmaddsubps_loop_128: +; CHECK: vfmaddsub231ps %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmaddsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x float> %c.addr.0 +} + +; CHECK-LABEL: fmsubaddps_loop_128: +; CHECK: vfmsubadd231ps %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmsubaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x float> %c.addr.0 +} + +; CHECK-LABEL: fmaddps_loop_128: +; CHECK: vfmadd231ps %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc 
+ +for.inc: + %0 = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x float> %c.addr.0 +} + +; CHECK-LABEL: fmsubps_loop_128: +; CHECK: vfmsub231ps %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x float> %c.addr.0 +} + +; CHECK-LABEL: fnmaddps_loop_128: +; CHECK: vfnmadd231ps %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fnmaddps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x float> %c.addr.0 +} + +; CHECK-LABEL: fnmsubps_loop_128: +; CHECK: vfnmsub231ps %xmm1, %xmm0, %xmm2 +; CHECK: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq +define <4 x float> @fnmsubps_loop_128(i32 %iter, <4 x float> %a, <4 x float> %b, <4 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, 
%for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a, <4 x float> %b, <4 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x float> %c.addr.0 +} + +declare <4 x float> @llvm.x86.fma.vfmaddsub.ps(<4 x float>, <4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.fma.vfmsubadd.ps(<4 x float>, <4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) +declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x float>) + +; CHECK-LABEL: fmaddsubpd_loop_256: +; CHECK: vfmaddsub231pd %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <4 x double> @fmaddsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: br label %for.cond @@ -24,9 +353,11 @@ ret <4 x double> %c.addr.0 } -; CHECK-LABEL: fmsubaddpd_loop: -; CHECK: vfmsubadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <4 x double> @fmsubaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +; CHECK-LABEL: fmsubaddpd_loop_256: +; CHECK: vfmsubadd231pd %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <4 x double> @fmsubaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: br label %for.cond @@ -48,9 +379,11 @@ ret <4 x double> %c.addr.0 } -; CHECK-LABEL: fmaddpd_loop: -; CHECK: vfmadd231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <4 x double> @fmaddpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +; CHECK-LABEL: fmaddpd_loop_256: +; CHECK: vfmadd231pd %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; 
CHECK-NEXT: retq +define <4 x double> @fmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: br label %for.cond @@ -72,9 +405,11 @@ ret <4 x double> %c.addr.0 } -; CHECK-LABEL: fmsubpd_loop: -; CHECK: vfmsub231pd %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <4 x double> @fmsubpd_loop(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +; CHECK-LABEL: fmsubpd_loop_256: +; CHECK: vfmsub231pd %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <4 x double> @fmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { entry: br label %for.cond @@ -96,15 +431,71 @@ ret <4 x double> %c.addr.0 } +; CHECK-LABEL: fnmaddpd_loop_256: +; CHECK: vfnmadd231pd %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <4 x double> @fnmaddpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} + +; CHECK-LABEL: fnmsubpd_loop_256: +; CHECK: vfnmsub231pd %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <4 x double> @fnmsubpd_loop_256(i32 %iter, <4 x double> %a, <4 x double> %b, <4 x double> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <4 x double> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x 
double> %a, <4 x double> %b, <4 x double> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <4 x double> %c.addr.0 +} + declare <4 x double> @llvm.x86.fma.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>) declare <4 x double> @llvm.x86.fma.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>) declare <4 x double> @llvm.x86.fma.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) declare <4 x double> @llvm.x86.fma.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>) +declare <4 x double> @llvm.x86.fma.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>) -; CHECK-LABEL: fmaddsubps_loop: -; CHECK: vfmaddsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <8 x float> @fmaddsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +; CHECK-LABEL: fmaddsubps_loop_256: +; CHECK: vfmaddsub231ps %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <8 x float> @fmaddsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { entry: br label %for.cond @@ -126,9 +517,11 @@ ret <8 x float> %c.addr.0 } -; CHECK-LABEL: fmsubaddps_loop: -; CHECK: vfmsubadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <8 x float> @fmsubaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +; CHECK-LABEL: fmsubaddps_loop_256: +; CHECK: vfmsubadd231ps %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <8 x float> @fmsubaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { entry: br label %for.cond @@ -150,9 +543,11 @@ ret <8 x float> %c.addr.0 } -; CHECK-LABEL: fmaddps_loop: -; CHECK: vfmadd231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <8 x float> @fmaddps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +; CHECK-LABEL: fmaddps_loop_256: +; CHECK: vfmadd231ps %ymm1, %ymm0, 
%ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <8 x float> @fmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { entry: br label %for.cond @@ -174,9 +569,11 @@ ret <8 x float> %c.addr.0 } -; CHECK-LABEL: fmsubps_loop: -; CHECK: vfmsub231ps %ymm{{[0-9]+}}, %ymm{{[0-9]+}}, %ymm{{[0-9]+}} -define <8 x float> @fmsubps_loop(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +; CHECK-LABEL: fmsubps_loop_256: +; CHECK: vfmsub231ps %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <8 x float> @fmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { entry: br label %for.cond @@ -198,7 +595,61 @@ ret <8 x float> %c.addr.0 } +; CHECK-LABEL: fnmaddps_loop_256: +; CHECK: vfnmadd231ps %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <8 x float> @fnmaddps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} + +; CHECK-LABEL: fnmsubps_loop_256: +; CHECK: vfnmsub231ps %ymm1, %ymm0, %ymm2 +; CHECK: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: retq +define <8 x float> @fnmsubps_loop_256(i32 %iter, <8 x float> %a, <8 x float> %b, <8 x float> %c) { +entry: + br label %for.cond + +for.cond: + %c.addr.0 = phi <8 x float> [ %c, %entry ], [ %0, %for.inc ] + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, %iter + br i1 %cmp, label %for.body, label %for.end + +for.body: + br label %for.inc + +for.inc: + %0 = call <8 x float> 
@llvm.x86.fma.vfnmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c.addr.0) + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: + ret <8 x float> %c.addr.0 +} + declare <8 x float> @llvm.x86.fma.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>) declare <8 x float> @llvm.x86.fma.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>) declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) +declare <8 x float> @llvm.x86.fma.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>) Index: llvm/test/CodeGen/X86/fma-intrinsics-x86.ll =================================================================== --- llvm/test/CodeGen/X86/fma-intrinsics-x86.ll +++ llvm/test/CodeGen/X86/fma-intrinsics-x86.ll @@ -18,6 +18,21 @@ %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } + +define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_ss: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_ss: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmaddss %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -33,6 +48,21 @@ %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } + +define <2 x double> 
@test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmadd_bac_sd: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmadd_bac_sd: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmaddsd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> @test_x86_fma_vfmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -109,6 +139,21 @@ %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } + +define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_ss: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_ss: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmsubss %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -124,6 +169,21 @@ %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } + +define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfmsub_bac_sd: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: 
vfmsub213sd %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfmsub_bac_sd: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfmsubsd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> @test_x86_fma_vfmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -200,6 +260,21 @@ %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } + +define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_ss: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_ss: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfnmaddss %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -215,6 +290,21 @@ %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } + +define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmadd_bac_sd: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfnmadd_bac_sd: +; CHECK-FMA4: # 
BB#0: +; CHECK-FMA4-NEXT: vfnmaddsd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a1, <2 x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> @test_x86_fma_vfnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { @@ -291,6 +381,21 @@ %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ret <4 x float> %res } + +define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_ss: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_ss: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfnmsubss %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a1, <4 x float> %a0, <4 x float> %a2) + ret <4 x float> %res +} declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { @@ -306,6 +411,21 @@ %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ret <2 x double> %res } + +define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) #0 { +; CHECK-FMA-LABEL: test_x86_fma_vfnmsub_bac_sd: +; CHECK-FMA: # BB#0: +; CHECK-FMA-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 +; CHECK-FMA-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-FMA-NEXT: retq +; +; CHECK-FMA4-LABEL: test_x86_fma_vfnmsub_bac_sd: +; CHECK-FMA4: # BB#0: +; CHECK-FMA4-NEXT: vfnmsubsd %xmm2, %xmm0, %xmm1, %xmm0 +; CHECK-FMA4-NEXT: retq + %res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a1, <2 
x double> %a0, <2 x double> %a2) + ret <2 x double> %res +} declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) define <4 x float> @test_x86_fma_vfnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {