diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -230,6 +230,7 @@ def llvm_x86mmx_ty : LLVMType<x86mmx>; def llvm_ptrx86mmx_ty : LLVMPointerType<llvm_x86mmx_ty>; // <1 x i64>* +def llvm_v1i1_ty : LLVMType<v1i1>; // 1 x i1 def llvm_v2i1_ty : LLVMType<v2i1>; // 2 x i1 def llvm_v4i1_ty : LLVMType<v4i1>; // 4 x i1 def llvm_v8i1_ty : LLVMType<v8i1>; // 8 x i1 diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1861,6 +1861,37 @@ llvm_v8i64_ty], [IntrNoMem]>; } +// AVX5124FMAPS +let TargetPrefix = "x86" in { + def int_x86_avx512_mask_v4fmadd_ps_512 : + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, + llvm_v16f32_ty, llvm_v16f32_ty, llvm_ptr_ty, + llvm_v16i1_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_x86_avx512_mask_v4fnmadd_ps_512 : + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, + llvm_v16f32_ty, llvm_v16f32_ty, llvm_ptr_ty, + llvm_v16i1_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_x86_avx512_mask_v4fmadd_ss : + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_v4f32_ty, llvm_ptr_ty, + llvm_v1i1_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_x86_avx512_mask_v4fnmadd_ss : + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, + llvm_v4f32_ty, llvm_v4f32_ty, llvm_ptr_ty, + llvm_v1i1_ty], + [IntrReadMem, IntrArgMemOnly]>; +} + // VNNI let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_vpdpbusd_128 : @@ -1916,6 +1947,23 @@ llvm_v16i32_ty], [IntrNoMem]>; } +// AVX5124VNNIW +let TargetPrefix = "x86" in { + def int_x86_avx512_mask_vp4dpwssd_512 : + Intrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, + llvm_v16i32_ty, llvm_v16i32_ty, llvm_ptr_ty, + llvm_v16i1_ty], + [IntrReadMem, IntrArgMemOnly]>; + + def int_x86_avx512_mask_vp4dpwssds_512 : + Intrinsic<[llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, + llvm_v16i32_ty, llvm_v16i32_ty, llvm_ptr_ty, + llvm_v16i1_ty], + [IntrReadMem, IntrArgMemOnly]>; +} + //===----------------------------------------------------------------------===// // XOP diff --git a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h --- a/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h +++ b/llvm/include/llvm/Support/X86DisassemblerDecoderCommon.h @@ -422,6 +422,9 @@ ENUM_ENTRY(TYPE_XMM, "16-byte") \ ENUM_ENTRY(TYPE_YMM, "32-byte") \ ENUM_ENTRY(TYPE_ZMM, "64-byte") \ + ENUM_ENTRY(TYPE_XMM_QUAD, "16-byte quad") \ + ENUM_ENTRY(TYPE_YMM_QUAD, "32-byte quad") \ + ENUM_ENTRY(TYPE_ZMM_QUAD, "64-byte quad") \ ENUM_ENTRY(TYPE_VK, "mask register") \ ENUM_ENTRY(TYPE_VK_PAIR, "mask register pair") \ ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand") \ diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h --- a/llvm/lib/Target/X86/AsmParser/X86Operand.h +++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h @@ -481,6 +481,16 @@ X86MCRegisterClasses[X86::VK16RegClassID].contains(getReg()); } + bool isVR128XQuad() const { + return Kind == Register && + X86MCRegisterClasses[X86::VR128XRegClassID].contains(getReg()); + } + + bool isVR512Quad() const { + return Kind == Register && + X86MCRegisterClasses[X86::VR512RegClassID].contains(getReg()); +
} + void addExpr(MCInst &Inst, const MCExpr *Expr) const { // Add as immediates when possible. if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) @@ -536,6 +546,81 @@ Inst.addOperand(MCOperand::createReg(Reg)); } + void addQuadRegOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + unsigned Reg = getReg(); + switch (Reg) { + default: llvm_unreachable("Unexpected register!"); + case X86::XMM1: case X86::XMM2: case X86::XMM3: + case X86::XMM5: case X86::XMM6: case X86::XMM7: + case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::XMM17: case X86::XMM18: case X86::XMM19: + case X86::XMM21: case X86::XMM22: case X86::XMM23: + case X86::XMM25: case X86::XMM26: case X86::XMM27: + case X86::XMM29: case X86::XMM30: case X86::XMM31: + case X86::ZMM1: case X86::ZMM2: case X86::ZMM3: + case X86::ZMM5: case X86::ZMM6: case X86::ZMM7: + case X86::ZMM9: case X86::ZMM10: case X86::ZMM11: + case X86::ZMM13: case X86::ZMM14: case X86::ZMM15: + case X86::ZMM17: case X86::ZMM18: case X86::ZMM19: + case X86::ZMM21: case X86::ZMM22: case X86::ZMM23: + case X86::ZMM25: case X86::ZMM26: case X86::ZMM27: + case X86::ZMM29: case X86::ZMM30: case X86::ZMM31: + // Leave an unaligned register alone so we can generate its encoding. + break; + case X86::XMM0: + Reg = X86::XMM0_XMM1_XMM2_XMM3; + break; + case X86::XMM4: + Reg = X86::XMM4_XMM5_XMM6_XMM7; + break; + case X86::XMM8: + Reg = X86::XMM8_XMM9_XMM10_XMM11; + break; + case X86::XMM12: + Reg = X86::XMM12_XMM13_XMM14_XMM15; + break; + case X86::XMM16: + Reg = X86::XMM16_XMM17_XMM18_XMM19; + break; + case X86::XMM20: + Reg = X86::XMM20_XMM21_XMM22_XMM23; + break; + case X86::XMM24: + Reg = X86::XMM24_XMM25_XMM26_XMM27; + break; + case X86::XMM28: + Reg = X86::XMM28_XMM29_XMM30_XMM31; + break; + case X86::ZMM0: + Reg = X86::ZMM0_ZMM1_ZMM2_ZMM3; + break; + case X86::ZMM4: + Reg = X86::ZMM4_ZMM5_ZMM6_ZMM7; + break; + case X86::ZMM8: + Reg = X86::ZMM8_ZMM9_ZMM10_ZMM11; + break; + case X86::ZMM12: + Reg = X86::ZMM12_ZMM13_ZMM14_ZMM15; + break; + case X86::ZMM16: + Reg = X86::ZMM16_ZMM17_ZMM18_ZMM19; + break; + case X86::ZMM20: + Reg = X86::ZMM20_ZMM21_ZMM22_ZMM23; + break; + case X86::ZMM24: + Reg = X86::ZMM24_ZMM25_ZMM26_ZMM27; + break; + case X86::ZMM28: + Reg = X86::ZMM28_ZMM29_ZMM30_ZMM31; + break; + } + Inst.addOperand(MCOperand::createReg(Reg)); + } + void addMemOperands(MCInst &Inst, unsigned N) const { assert((N == 5) && "Invalid number of operands!"); Inst.addOperand(MCOperand::createReg(getMemBaseReg())); diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp --- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -776,6 +776,10 @@ return prefix##_YMM0 + index; \ case TYPE_XMM: \ return prefix##_XMM0 + index; \ + case TYPE_ZMM_QUAD: \ + return prefix##_ZMM0_ZMM1_ZMM2_ZMM3 + (index / 4); \ + case TYPE_XMM_QUAD: \ + return prefix##_XMM0_XMM1_XMM2_XMM3 + (index / 4); \ case TYPE_VK: \ index &= 0xf; \ if (index > 7) \ diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h --- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -314,6 +314,26 @@ ENTRY(ZMM30) \ ENTRY(ZMM31) +#define REGS_XMM_QUADS \ + ENTRY(XMM0_XMM1_XMM2_XMM3) \ + ENTRY(XMM4_XMM5_XMM6_XMM7) \ +
ENTRY(XMM8_XMM9_XMM10_XMM11) \ + ENTRY(XMM12_XMM13_XMM14_XMM15) \ + ENTRY(XMM16_XMM17_XMM18_XMM19) \ + ENTRY(XMM20_XMM21_XMM22_XMM23) \ + ENTRY(XMM24_XMM25_XMM26_XMM27) \ + ENTRY(XMM28_XMM29_XMM30_XMM31) + +#define REGS_ZMM_QUADS \ + ENTRY(ZMM0_ZMM1_ZMM2_ZMM3) \ + ENTRY(ZMM4_ZMM5_ZMM6_ZMM7) \ + ENTRY(ZMM8_ZMM9_ZMM10_ZMM11) \ + ENTRY(ZMM12_ZMM13_ZMM14_ZMM15) \ + ENTRY(ZMM16_ZMM17_ZMM18_ZMM19) \ + ENTRY(ZMM20_ZMM21_ZMM22_ZMM23) \ + ENTRY(ZMM24_ZMM25_ZMM26_ZMM27) \ + ENTRY(ZMM28_ZMM29_ZMM30_ZMM31) + #define REGS_MASKS \ ENTRY(K0) \ ENTRY(K1) \ @@ -398,6 +418,8 @@ REGS_XMM \ REGS_YMM \ REGS_ZMM \ + REGS_XMM_QUADS \ + REGS_ZMM_QUADS \ REGS_MASKS \ REGS_MASK_PAIRS \ REGS_SEGMENT \ diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h @@ -36,6 +36,7 @@ void printInstFlags(const MCInst *MI, raw_ostream &O); void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O); void printVKPair(const MCInst *MI, unsigned OpNo, raw_ostream &OS); + void printQuadReg(const MCInst *MI, unsigned OpNo, raw_ostream &OS); }; } // end namespace llvm diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -368,3 +368,78 @@ } llvm_unreachable("Unknown mask pair register name"); } + +void X86InstPrinterCommon::printQuadReg(const MCInst *MI, unsigned OpNo, + raw_ostream &OS) { + unsigned Reg = MI->getOperand(OpNo).getReg(); + switch (Reg) { + case X86::XMM1: case X86::XMM2: case X86::XMM3: + case X86::XMM5: case X86::XMM6: case X86::XMM7: + case X86::XMM9: case X86::XMM10: case X86::XMM11: + case X86::XMM13: case X86::XMM14: case X86::XMM15: + case X86::XMM17: case X86::XMM18: case X86::XMM19: + case X86::XMM21: case X86::XMM22: case X86::XMM23: + case X86::XMM25: case X86::XMM26: case X86::XMM27: + case X86::XMM29: case X86::XMM30: case X86::XMM31: + case X86::ZMM1: case X86::ZMM2: case X86::ZMM3: + case X86::ZMM5: case X86::ZMM6: case X86::ZMM7: + case X86::ZMM9: case X86::ZMM10: case X86::ZMM11: + case X86::ZMM13: case X86::ZMM14: case X86::ZMM15: + case X86::ZMM17: case X86::ZMM18: case X86::ZMM19: + case X86::ZMM21: case X86::ZMM22: case X86::ZMM23: + case X86::ZMM25: case X86::ZMM26: case X86::ZMM27: + case X86::ZMM29: case X86::ZMM30: case X86::ZMM31: + // We can get unaligned values from the parser, just print them. 
+ printRegName(OS, Reg); + return; + case X86::XMM0_XMM1_XMM2_XMM3: + printRegName(OS, X86::XMM0); + return; + case X86::XMM4_XMM5_XMM6_XMM7: + printRegName(OS, X86::XMM4); + return; + case X86::XMM8_XMM9_XMM10_XMM11: + printRegName(OS, X86::XMM8); + return; + case X86::XMM12_XMM13_XMM14_XMM15: + printRegName(OS, X86::XMM12); + return; + case X86::XMM16_XMM17_XMM18_XMM19: + printRegName(OS, X86::XMM16); + return; + case X86::XMM20_XMM21_XMM22_XMM23: + printRegName(OS, X86::XMM20); + return; + case X86::XMM24_XMM25_XMM26_XMM27: + printRegName(OS, X86::XMM24); + return; + case X86::XMM28_XMM29_XMM30_XMM31: + printRegName(OS, X86::XMM28); + return; + case X86::ZMM0_ZMM1_ZMM2_ZMM3: + printRegName(OS, X86::ZMM0); + return; + case X86::ZMM4_ZMM5_ZMM6_ZMM7: + printRegName(OS, X86::ZMM4); + return; + case X86::ZMM8_ZMM9_ZMM10_ZMM11: + printRegName(OS, X86::ZMM8); + return; + case X86::ZMM12_ZMM13_ZMM14_ZMM15: + printRegName(OS, X86::ZMM12); + return; + case X86::ZMM16_ZMM17_ZMM18_ZMM19: + printRegName(OS, X86::ZMM16); + return; + case X86::ZMM20_ZMM21_ZMM22_ZMM23: + printRegName(OS, X86::ZMM20); + return; + case X86::ZMM24_ZMM25_ZMM26_ZMM27: + printRegName(OS, X86::ZMM24); + return; + case X86::ZMM28_ZMM29_ZMM30_ZMM31: + printRegName(OS, X86::ZMM28); + return; + } + llvm_unreachable("Unknown quad register name"); +} diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -133,6 +133,12 @@ def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", "Enable AVX-512 Exponential and Reciprocal Instructions", [FeatureAVX512]>; +def FeatureAVX5124FMAPS : SubtargetFeature<"avx5124fmaps", "HasAVX5124FMAPS", "true", + "Enable AVX-512 4FMAPS Instructions", + [FeatureAVX512]>; +def FeatureAVX5124VNNI : SubtargetFeature<"avx5124vnni", "HasAVX5124VNNI", "true", + "Enable AVX-512 4VNNI Instructions", + [FeatureAVX512]>; def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", "Enable AVX-512 Conflict Detection Instructions", [FeatureAVX512]>; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -821,6 +821,14 @@ MGATHER, MSCATTER, + // AVX5124FMAPS + V4FMADD, + V4FNMADD, + + // AVX5124VNNI + VP4DPWSSD, + VP4DPWSSDS, + // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all // opcodes will be thought as target memory ops!
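For reference, the seven-operand shape shared by all four new intrinsics (accumulator, four sequential sources, a pointer to the 128-bit memory operand, and a mask) is exactly what the getTgtMemIntrinsic change below consumes. A minimal IR sketch of an unmasked call, using the declarations added above (the function and value names are illustrative, not part of the patch):

declare <16 x float> @llvm.x86.avx512.mask.v4fmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, <16 x float>, <16 x float>, i8*, <16 x i1>)

define <16 x float> @sample(<16 x float> %acc, <16 x float> %s0, <16 x float> %s1, <16 x float> %s2, <16 x float> %s3, i8* %mem) {
  ; An all-ones mask selects the unmasked instruction form; %s0..%s3 are
  ; placed into a sequential ZMM quad by the REG_SEQUENCE patterns below.
  %r = call <16 x float> @llvm.x86.avx512.mask.v4fmadd.ps.512(<16 x float> %acc, <16 x float> %s0, <16 x float> %s1, <16 x float> %s2, <16 x float> %s3, i8* %mem, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <16 x float> %r
}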
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4992,6 +4992,42 @@ const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { + switch (Intrinsic) { + case Intrinsic::x86_avx512_mask_vp4dpwssd_512: + case Intrinsic::x86_avx512_mask_vp4dpwssds_512: + case Intrinsic::x86_avx512_mask_v4fmadd_ps_512: + case Intrinsic::x86_avx512_mask_v4fnmadd_ps_512: + case Intrinsic::x86_avx512_mask_v4fmadd_ss: + case Intrinsic::x86_avx512_mask_v4fnmadd_ss: + switch (Intrinsic) { + default: llvm_unreachable("Unexpected intrinsic"); + case Intrinsic::x86_avx512_mask_vp4dpwssd_512: + Info.opc = X86ISD::VP4DPWSSD; + Info.memVT = MVT::v4i32; + break; + case Intrinsic::x86_avx512_mask_vp4dpwssds_512: + Info.opc = X86ISD::VP4DPWSSDS; + Info.memVT = MVT::v4i32; + break; + case Intrinsic::x86_avx512_mask_v4fmadd_ps_512: + case Intrinsic::x86_avx512_mask_v4fmadd_ss: + Info.opc = X86ISD::V4FMADD; + Info.memVT = MVT::v4f32; + break; + case Intrinsic::x86_avx512_mask_v4fnmadd_ps_512: + case Intrinsic::x86_avx512_mask_v4fnmadd_ss: + Info.opc = X86ISD::V4FNMADD; + Info.memVT = MVT::v4f32; + break; + } + + Info.ptrVal = I.getArgOperand(5); + Info.offset = 0; + Info.size = Info.memVT.getStoreSize(); + Info.align = Align(1); + Info.flags = MachineMemOperand::MOLoad; + return true; + } const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); if (!IntrData) @@ -30423,6 +30459,10 @@ NODE_NAME_CASE(VPDPBUSDS) NODE_NAME_CASE(VPDPWSSD) NODE_NAME_CASE(VPDPWSSDS) + NODE_NAME_CASE(VP4DPWSSD) + NODE_NAME_CASE(VP4DPWSSDS) + NODE_NAME_CASE(V4FMADD) + NODE_NAME_CASE(V4FNMADD) NODE_NAME_CASE(VPSHUFBITQMB) NODE_NAME_CASE(GF2P8MULB) NODE_NAME_CASE(GF2P8AFFINEQB) diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -12130,31 +12130,114 @@ // AVX5124FMAPS //===----------------------------------------------------------------------===// +multiclass avx512_quad_pseudo<string InstrStr, SDNode OpNode, X86VectorVTInfo _, RegisterClass QuadRC, SubRegIndex subreg0, SubRegIndex subreg1, SubRegIndex subreg2, SubRegIndex subreg3> { + def : Pat<(_.VT (OpNode _.RC:$src1, _.RC:$src2a, _.RC:$src2b, _.RC:$src2c, + _.RC:$src2d, addr:$src3, (_.KVT immAllOnesV))), + (!cast<Instruction>(InstrStr#"rm") + _.RC:$src1, (REG_SEQUENCE QuadRC, _.RC:$src2a, subreg0, + _.RC:$src2b, subreg1, + _.RC:$src2c, subreg2, + _.RC:$src2d, subreg3), + addr:$src3)>; + + def : Pat<(_.VT (OpNode _.RC:$src1, _.RC:$src2a, _.RC:$src2b, _.RC:$src2c, + _.RC:$src2d, addr:$src3, _.KRCWM:$mask)), + (!cast<Instruction>(InstrStr#"rmkz") + _.RC:$src1, _.KRCWM:$mask, + (REG_SEQUENCE QuadRC, _.RC:$src2a, subreg0, + _.RC:$src2b, subreg1, + _.RC:$src2c, subreg2, + _.RC:$src2d, subreg3), + addr:$src3)>; + + def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, _.RC:$src2a, + _.RC:$src2b, _.RC:$src2c, + _.RC:$src2d, addr:$src3, + _.KRCWM:$mask), + _.RC:$src1)), + (!cast<Instruction>(InstrStr#"rmk") + _.RC:$src1, _.KRCWM:$mask, + (REG_SEQUENCE QuadRC, _.RC:$src2a, subreg0, + _.RC:$src2b, subreg1, + _.RC:$src2c, subreg2, + _.RC:$src2d, subreg3), + addr:$src3)>; + + def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src1, _.RC:$src2a, + _.RC:$src2b, _.RC:$src2c, + _.RC:$src2d, addr:$src3, + _.KRCWM:$mask), + _.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#"rmkz") + _.RC:$src1, _.KRCWM:$mask, + (REG_SEQUENCE QuadRC, _.RC:$src2a, subreg0, + _.RC:$src2b, subreg1, + _.RC:$src2c, subreg2, + _.RC:$src2d, subreg3), + addr:$src3)>; +} + +multiclass avx512_quad_pseudo_scalar<string InstrStr, SDNode OpNode, X86VectorVTInfo _, RegisterClass QuadRC, SubRegIndex subreg0, SubRegIndex subreg1, SubRegIndex subreg2, SubRegIndex subreg3> { + def : Pat<(_.VT (OpNode _.RC:$src1,
_.RC:$src2a, _.RC:$src2b, _.RC:$src2c, + _.RC:$src2d, addr:$src3, (_.KVT immAllOnesV))), + (!cast<Instruction>(InstrStr#"rm") + _.RC:$src1, (REG_SEQUENCE QuadRC, _.RC:$src2a, subreg0, + _.RC:$src2b, subreg1, + _.RC:$src2c, subreg2, + _.RC:$src2d, subreg3), + addr:$src3)>; + def : Pat<(_.VT (OpNode _.RC:$src1, _.RC:$src2a, _.RC:$src2b, _.RC:$src2c, + _.RC:$src2d, addr:$src3, _.KRCWM:$mask)), + (!cast<Instruction>(InstrStr#"rmkz") + _.RC:$src1, _.KRCWM:$mask, + (REG_SEQUENCE QuadRC, _.RC:$src2a, subreg0, + _.RC:$src2b, subreg1, + _.RC:$src2c, subreg2, + _.RC:$src2d, subreg3), + addr:$src3)>; +} + let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle, Constraints = "$src1 = $dst", Uses = [MXCSR], mayRaiseFPException = 1 in { -defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info, - (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), - "v4fmaddps", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, - Sched<[SchedWriteFMA.ZMM.Folded]>; - -defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info, - (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), - "v4fnmaddps", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, - Sched<[SchedWriteFMA.ZMM.Folded]>; - -defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info, - (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), - "v4fmaddss", "$src3, $src2", "$src2, $src3", - []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, - Sched<[SchedWriteFMA.Scl.Folded]>; - -defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info, - (outs VR128X:$dst), (ins VR128X:$src2, f128mem:$src3), - "v4fnmaddss", "$src3, $src2", "$src2, $src3", - []>, VEX_LIG, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>, - Sched<[SchedWriteFMA.Scl.Folded]>; + let hasExtraSrcRegAllocReq = 1 in { + defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info, + (outs VR512:$dst), (ins VR512Quad:$src2, f128mem:$src3), + "v4fmaddps", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>; + + defm V4FNMADDPSrm : AVX512_maskable_3src_in_asm<0xAA, MRMSrcMem, v16f32_info, + (outs VR512:$dst), (ins VR512Quad:$src2, f128mem:$src3), + "v4fnmaddps", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>; + + defm V4FMADDSSrm : AVX512_maskable_3src_in_asm<0x9B, MRMSrcMem, f32x_info, + (outs VR128X:$dst), (ins VR128XQuad:$src2, f128mem:$src3), + "v4fmaddss", "$src3, $src2", "$src2, $src3", + []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>; + + defm V4FNMADDSSrm : AVX512_maskable_3src_in_asm<0xAB, MRMSrcMem, f32x_info, + (outs VR128X:$dst), (ins VR128XQuad:$src2, f128mem:$src3), + "v4fnmaddss", "$src3, $src2", "$src2, $src3", + []>, EVEX_V128, EVEX_4V, T8XD, EVEX_CD8<32, CD8VF>; + } +} + +let Predicates = [HasAVX5124FMAPS] in { +defm : avx512_quad_pseudo<"V4FMADDPS", X86v4fmadd, v16f32_info, VR512QUAD, + zsub_0, zsub_1, zsub_2, zsub_3>; +defm : avx512_quad_pseudo<"V4FNMADDPS", X86v4fnmadd, v16f32_info, VR512QUAD, + zsub_0, zsub_1, zsub_2, zsub_3>; + +defm : avx512_quad_pseudo_scalar<"V4FMADDSS", X86v4fmadd, f32x_info, VR128XQUAD, + xsub_0, xsub_1, xsub_2, xsub_3>; +defm : avx512_quad_pseudo_scalar<"V4FNMADDSS", X86v4fnmadd, f32x_info, VR128XQUAD, + xsub_0, xsub_1, xsub_2, xsub_3>; } //===----------------------------------------------------------------------===// @@ -12163,19 +12246,30 @@ let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedInt, Constraints = "$src1 = $dst" in { -defm VP4DPWSSDrm :
AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info, - (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), - "vp4dpwssd", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, - Sched<[SchedWriteFMA.ZMM.Folded]>; + let hasExtraSrcRegAllocReq = 1 in { + defm VP4DPWSSDrm : AVX512_maskable_3src_in_asm<0x52, MRMSrcMem, v16i32_info, + (outs VR512:$dst), (ins VR512Quad:$src2, i128mem:$src3), + "vp4dpwssd", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>; + + defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, + (outs VR512:$dst), (ins VR512Quad:$src2, i128mem:$src3), + "vp4dpwssds", "$src3, $src2", "$src2, $src3", + []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>; + } +} -defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info, - (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), - "vp4dpwssds", "$src3, $src2", "$src2, $src3", - []>, EVEX_V512, EVEX_4V, T8XD, EVEX_CD8<32, CD8VQ>, - Sched<[SchedWriteFMA.ZMM.Folded]>; +let Predicates = [HasAVX5124VNNI] in { +defm : avx512_quad_pseudo<"VP4DPWSSD", X86vp4dpwssd, v16i32_info, VR512QUAD, + zsub_0, zsub_1, zsub_2, zsub_3>; +defm : avx512_quad_pseudo<"VP4DPWSSDS", X86vp4dpwssds, v16i32_info, VR512QUAD, + zsub_0, zsub_1, zsub_2, zsub_3>; } +//===----------------------------------------------------------------------===// +// VP2INTERSECT +//===----------------------------------------------------------------------===// + let hasSideEffects = 0 in { let mayStore = 1, SchedRW = [WriteFStoreX] in def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>; @@ -12183,10 +12277,6 @@ def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>; } -//===----------------------------------------------------------------------===// -// VP2INTERSECT -//===----------------------------------------------------------------------===// - multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> { def rr : I<0x68, MRMSrcReg, (outs _.KRPC:$dst), diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -579,6 +579,33 @@ def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>; def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>; +// AVX5124VNNI +def SDTVnni4 : SDTypeProfile<1, 7, [SDTCisVT<0, v16i32>, + SDTCisVT<1, v16i32>, + SDTCisVT<2, v16i32>, + SDTCisVT<3, v16i32>, + SDTCisVT<4, v16i32>, + SDTCisVT<5, v16i32>, + SDTCisPtrTy<6>, + SDTCisVT<7, v16i1>]>; +def X86vp4dpwssd : SDNode<"X86ISD::VP4DPWSSD", SDTVnni4, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vp4dpwssds : SDNode<"X86ISD::VP4DPWSSDS", SDTVnni4, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// AVX5124FMAPS +def SDTv4fmadd : SDTypeProfile<1, 7, [SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<0, 3>, + SDTCisSameAs<0, 4>, + SDTCisSameAs<0, 5>, + SDTCisPtrTy<6>, + SDTCVecEltisVT<7, i1>]>; +def X86v4fmadd : SDNode<"X86ISD::V4FMADD", SDTv4fmadd, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86v4fnmadd : SDNode<"X86ISD::V4FNMADD", SDTv4fmadd, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOp>; def X86rsqrt28SAE: SDNode<"X86ISD::RSQRT28_SAE", SDTFPUnaryOp>; def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOp>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td --- a/llvm/lib/Target/X86/X86InstrInfo.td +++
b/llvm/lib/Target/X86/X86InstrInfo.td @@ -807,6 +807,19 @@ let ParserMatchClass = VK16PairAsmOperand; } +let RenderMethod = "addQuadRegOperands" in { + def VR128XQuadAsmOperand : AsmOperandClass { let Name = "VR128XQuad"; } + def VR512QuadAsmOperand : AsmOperandClass { let Name = "VR512Quad"; } +} + +def VR128XQuad : RegisterOperand<VR128XQUAD, "printQuadReg"> { + let ParserMatchClass = VR128XQuadAsmOperand; +} + +def VR512Quad : RegisterOperand<VR512QUAD, "printQuadReg"> { + let ParserMatchClass = VR512QuadAsmOperand; +} + //===----------------------------------------------------------------------===// // X86 Complex Pattern Definitions. // @@ -882,6 +895,8 @@ def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">; def HasPFI : Predicate<"Subtarget->hasPFI()">; def HasERI : Predicate<"Subtarget->hasERI()">; +def HasAVX5124FMAPS : Predicate<"Subtarget->hasAVX5124FMAPS()">; +def HasAVX5124VNNI : Predicate<"Subtarget->hasAVX5124VNNI()">; def HasDQI : Predicate<"Subtarget->hasDQI()">; def NoDQI : Predicate<"!Subtarget->hasDQI()">; def HasBWI : Predicate<"Subtarget->hasBWI()">; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -30,6 +30,18 @@ def sub_ymm : SubRegIndex<256>; def sub_mask_0 : SubRegIndex<-1>; def sub_mask_1 : SubRegIndex<-1, -1>; + def xsub_0 : SubRegIndex<128>; + def xsub_1 : SubRegIndex<128>; + def xsub_2 : SubRegIndex<128>; + def xsub_3 : SubRegIndex<128>; + def ysub_0 : SubRegIndex<256>; + def ysub_1 : SubRegIndex<256>; + def ysub_2 : SubRegIndex<256>; + def ysub_3 : SubRegIndex<256>; + def zsub_0 : SubRegIndex<512>; + def zsub_1 : SubRegIndex<512>; + def zsub_2 : SubRegIndex<512>; + def zsub_3 : SubRegIndex<512>; } //===----------------------------------------------------------------------===// @@ -592,6 +604,26 @@ def VR256X : RegisterClass<"X86", [v8f32, v4f64, v32i8, v16i16, v8i32, v4i64], 256, (sequence "YMM%u", 0, 31)>; +def VR128XQUADS : RegisterTuples<[xsub_0, xsub_1, xsub_2, xsub_3], + [(decimate VR128X, 4), + (decimate (shl VR128X, 1), 4), + (decimate (shl VR128X, 2), 4), + (decimate (shl VR128X, 3), 4)]>; + +def VR128XQUAD : RegisterClass<"X86", [untyped], 128, (add VR128XQUADS)> { + let Size = 512; +} + +def VR512QUADS : RegisterTuples<[zsub_0, zsub_1, zsub_2, zsub_3], + [(decimate VR512, 4), + (decimate (shl VR512, 1), 4), + (decimate (shl VR512, 2), 4), + (decimate (shl VR512, 3), 4)]>; + +def VR512QUAD : RegisterClass<"X86", [untyped], 512, (add VR512QUADS)> { + let Size = 2048; +} + // Mask registers def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} def VK2 : RegisterClass<"X86", [v2i1], 16, (add VK1)> {let Size = 16;} diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -333,6 +333,12 @@ /// Processor has AVX-512 Exponential and Reciprocal Instructions bool HasERI = false; + /// Processor has AVX-512 4FMAPS Instructions + bool HasAVX5124FMAPS = false; + + /// Processor has AVX-512 4VNNI Instructions + bool HasAVX5124VNNI = false; + /// Processor has AVX-512 Conflict Detection Instructions bool HasCDI = false; @@ -703,6 +709,8 @@ bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } + bool hasAVX5124FMAPS() const { return HasAVX5124FMAPS; } + bool hasAVX5124VNNI() const { return HasAVX5124VNNI; } bool hasDQI() const { return HasDQI; } bool hasBWI() const {
return HasBWI; } bool hasVLX() const { return HasVLX; } diff --git a/llvm/test/CodeGen/X86/avx5124fmaps-intrinsics.ll b/llvm/test/CodeGen/X86/avx5124fmaps-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx5124fmaps-intrinsics.ll @@ -0,0 +1,158 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx5124fmaps | FileCheck %s + +declare <16 x float> @llvm.x86.avx512.mask.v4fmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, <16 x float>, <16 x float>, i8*, <16 x i1>) +declare <16 x float> @llvm.x86.avx512.mask.v4fnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, <16 x float>, <16 x float>, i8*, <16 x i1>) +declare <4 x float> @llvm.x86.avx512.mask.v4fmadd.ss(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i8*, <1 x i1>) +declare <4 x float> @llvm.x86.avx512.mask.v4fnmadd.ss(<4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i8*, <1 x i1>) + +define <16 x float> @fmaddps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr) { +; CHECK-LABEL: fmaddps: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm7 +; CHECK-NEXT: vmovaps %zmm3, %zmm6 +; CHECK-NEXT: vmovaps %zmm2, %zmm5 +; CHECK-NEXT: vmovaps %zmm1, %zmm4 +; CHECK-NEXT: v4fmaddps (%rdi), %zmm4, %zmm0 +; CHECK-NEXT: retq + %a = call <16 x float> @llvm.x86.avx512.mask.v4fmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x float> %a +} + +define <16 x float> @fmaddps_mask_no_passthru(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i32> %mask) { +; CHECK-LABEL: fmaddps_mask_no_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm11 +; CHECK-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-NEXT: vmovaps %zmm3, %zmm10 +; CHECK-NEXT: vmovaps %zmm2, %zmm9 +; CHECK-NEXT: vmovaps %zmm1, %zmm8 +; CHECK-NEXT: v4fmaddps (%rdi), %zmm8, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %m = icmp eq <16 x i32> %mask, zeroinitializer + %a = call <16 x float> @llvm.x86.avx512.mask.v4fmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> %m) + ret <16 x float> %a +} + +define <16 x float> @fmaddps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i32> %mask) { +; CHECK-LABEL: fmaddps_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm11 +; CHECK-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-NEXT: vmovaps %zmm3, %zmm10 +; CHECK-NEXT: vmovaps %zmm2, %zmm9 +; CHECK-NEXT: vmovaps %zmm1, %zmm8 +; CHECK-NEXT: v4fmaddps (%rdi), %zmm8, %zmm0 {%k1} +; CHECK-NEXT: retq + %m = icmp eq <16 x i32> %mask, zeroinitializer + %a = call <16 x float> @llvm.x86.avx512.mask.v4fmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> %m) + %b = select <16 x i1> %m, <16 x float> %a, <16 x float> %a0 + ret <16 x float> %b +} + +define <16 x float> @fmaddps_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i32> %mask) { +; CHECK-LABEL: fmaddps_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm11 +; CHECK-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-NEXT: vmovaps %zmm3, %zmm10 +; CHECK-NEXT: vmovaps %zmm2, %zmm9 +; CHECK-NEXT: vmovaps %zmm1, %zmm8 +; CHECK-NEXT: v4fmaddps
(%rdi), %zmm8, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %m = icmp eq <16 x i32> %mask, zeroinitializer + %a = call <16 x float> @llvm.x86.avx512.mask.v4fmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> %m) + %b = select <16 x i1> %m, <16 x float> %a, <16 x float> zeroinitializer + ret <16 x float> %b +} + +define <16 x float> @fnmaddps(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr) { +; CHECK-LABEL: fnmaddps: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm7 +; CHECK-NEXT: vmovaps %zmm3, %zmm6 +; CHECK-NEXT: vmovaps %zmm2, %zmm5 +; CHECK-NEXT: vmovaps %zmm1, %zmm4 +; CHECK-NEXT: v4fnmaddps (%rdi), %zmm4, %zmm0 +; CHECK-NEXT: retq + %a = call <16 x float> @llvm.x86.avx512.mask.v4fnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x float> %a +} + +define <16 x float> @fnmaddps_mask_no_passthru(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i32> %mask) { +; CHECK-LABEL: fnmaddps_mask_no_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm11 +; CHECK-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-NEXT: vmovaps %zmm3, %zmm10 +; CHECK-NEXT: vmovaps %zmm2, %zmm9 +; CHECK-NEXT: vmovaps %zmm1, %zmm8 +; CHECK-NEXT: v4fnmaddps (%rdi), %zmm8, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %m = icmp eq <16 x i32> %mask, zeroinitializer + %a = call <16 x float> @llvm.x86.avx512.mask.v4fnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> %m) + ret <16 x float> %a +} + +define <16 x float> @fnmaddps_mask(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i32> %mask) { +; CHECK-LABEL: fnmaddps_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm11 +; CHECK-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-NEXT: vmovaps %zmm3, %zmm10 +; CHECK-NEXT: vmovaps %zmm2, %zmm9 +; CHECK-NEXT: vmovaps %zmm1, %zmm8 +; CHECK-NEXT: v4fnmaddps (%rdi), %zmm8, %zmm0 {%k1} +; CHECK-NEXT: retq + %m = icmp eq <16 x i32> %mask, zeroinitializer + %a = call <16 x float> @llvm.x86.avx512.mask.v4fnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> %m) + %b = select <16 x i1> %m, <16 x float> %a, <16 x float> %a0 + ret <16 x float> %b +} + +define <16 x float> @fnmaddps_maskz(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i32> %mask) { +; CHECK-LABEL: fnmaddps_maskz: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %zmm4, %zmm11 +; CHECK-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-NEXT: vmovaps %zmm3, %zmm10 +; CHECK-NEXT: vmovaps %zmm2, %zmm9 +; CHECK-NEXT: vmovaps %zmm1, %zmm8 +; CHECK-NEXT: v4fnmaddps (%rdi), %zmm8, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %m = icmp eq <16 x i32> %mask, zeroinitializer + %a = call <16 x float> @llvm.x86.avx512.mask.v4fnmadd.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, <16 x float> %a3, <16 x float> %a4, i8* %ptr, <16 x i1> %m) + %b = select <16 x i1> %m, <16 x float> %a, <16 x float> zeroinitializer + ret <16 x float> %b +} + +define <4 x float> @fmaddss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3, <4 x float> %a4, i8* %ptr) { +; CHECK-LABEL: fmaddss: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovaps %xmm4, %xmm7 +; CHECK-NEXT: vmovaps %xmm3, %xmm6 +;
CHECK-NEXT: vmovaps %xmm2, %xmm5 +; CHECK-NEXT: vmovaps %xmm1, %xmm4 +; CHECK-NEXT: v4fmaddss (%rdi), %xmm4, %xmm0 +; CHECK-NEXT: retq + %a = call <4 x float> @llvm.x86.avx512.mask.v4fmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3, <4 x float> %a4, i8* %ptr, <1 x i1> <i1 true>) + ret <4 x float> %a +} + +define <4 x float> @fmaddss_mask_no_passthru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3, <4 x float> %a4, i8* %ptr, <4 x i32> %mask) { +; CHECK-LABEL: fmaddss_mask_no_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm5 killed $xmm5 def $zmm5 +; CHECK-NEXT: vmovaps %xmm4, %xmm11 +; CHECK-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-NEXT: vmovaps %xmm3, %xmm10 +; CHECK-NEXT: vmovaps %xmm2, %xmm9 +; CHECK-NEXT: vmovaps %xmm1, %xmm8 +; CHECK-NEXT: v4fmaddss (%rdi), %xmm8, %xmm0 {%k1} {z} +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %m = icmp eq <4 x i32> %mask, zeroinitializer + %m1 = shufflevector <4 x i1> %m, <4 x i1> undef, <1 x i32> zeroinitializer + %a = call <4 x float> @llvm.x86.avx512.mask.v4fmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, <4 x float> %a3, <4 x float> %a4, i8* %ptr, <1 x i1> %m1) + ret <4 x float> %a +} diff --git a/llvm/test/CodeGen/X86/dynamic-regmask.ll b/llvm/test/CodeGen/X86/dynamic-regmask.ll --- a/llvm/test/CodeGen/X86/dynamic-regmask.ll +++ b/llvm/test/CodeGen/X86/dynamic-regmask.ll @@ -11,7 +11,7 @@ ret i32 %b2 } ; CHECK: name: caller -; CHECK: CALL64pcrel32 @callee, CustomRegMask($bh,$bl,$bp,$bph,$bpl,$bx,$ebp,$ebx,$esp,$hbp,$hbx,$hsp,$rbp,$rbx,$rsp,$sp,$sph,$spl,$r10,$r11,$r12,$r13,$r14,$r15,$xmm8,$xmm9,$xmm10,$xmm11,$xmm12,$xmm13,$xmm14,$xmm15,$r10b,$r11b,$r12b,$r13b,$r14b,$r15b,$r10bh,$r11bh,$r12bh,$r13bh,$r14bh,$r15bh,$r10d,$r11d,$r12d,$r13d,$r14d,$r15d,$r10w,$r11w,$r12w,$r13w,$r14w,$r15w,$r10wh,$r11wh,$r12wh,$r13wh,$r14wh,$r15wh), implicit $rsp, implicit $ssp, implicit $eax, implicit $ecx, implicit $edx, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax +; CHECK: CALL64pcrel32 @callee, CustomRegMask($bh,$bl,$bp,$bph,$bpl,$bx,$ebp,$ebx,$esp,$hbp,$hbx,$hsp,$rbp,$rbx,$rsp,$sp,$sph,$spl,$r10,$r11,$r12,$r13,$r14,$r15,$xmm8,$xmm9,$xmm10,$xmm11,$xmm12,$xmm13,$xmm14,$xmm15,$r10b,$r11b,$r12b,$r13b,$r14b,$r15b,$r10bh,$r11bh,$r12bh,$r13bh,$r14bh,$r15bh,$r10d,$r11d,$r12d,$r13d,$r14d,$r15d,$r10w,$r11w,$r12w,$r13w,$r14w,$r15w,$r10wh,$r11wh,$r12wh,$r13wh,$r14wh,$r15wh,$xmm8_xmm9_xmm10_xmm11,$xmm12_xmm13_xmm14_xmm15), implicit $rsp, implicit $ssp, implicit $eax, implicit $ecx, implicit $edx, implicit $edi, implicit $esi, implicit-def $rsp, implicit-def $ssp, implicit-def $eax ; CHECK: RET 0, $eax define x86_regcallcc {i32, i32, i32} @test_callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) nounwind { diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -356,13 +356,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding:
[0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill @@ -380,13 +380,13 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x54] @@ -616,13 +616,13 @@ ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xb4,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) 
## 10-byte Folded Spill @@ -696,13 +696,13 @@ ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x54,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x40] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x6c,0x24,0x78] @@ -1071,37 +1071,37 @@ ; FMACALL32_BDVER2-NEXT: andl $-32, %esp ## encoding: [0x83,0xe4,0xe0] ; FMACALL32_BDVER2-NEXT: subl $448, %esp ## encoding: [0x81,0xec,0xc0,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: ## imm = 0x1C0 -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm4 ## encoding: [0xc5,0xf8,0x28,0x65,0x38] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc9,0x01] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x5c,0x24,0x60] -; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0x80,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0xc0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x60,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x8c,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] -; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm4, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x64,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] +; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd8,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0xc0,0x00,0x00,0x00] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x54,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] @@ -1187,28 +1187,28 @@ ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x00,0x01,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x02] -; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm2, {{[0-9]+}}(%esp) ## encoding: 
[0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x02] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x02] +; FMACALL32_BDVER2-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x02] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 +; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xf4,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x4c,0x24,0x60] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x01] -; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x01] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x01] +; FMACALL32_BDVER2-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x01] ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18] @@ -1276,13 +1276,13 @@ ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38] -; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm2 ## 16-byte Reload -; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x94,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x8c,0x24,0xb0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x08,0x03] -; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm2, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x54,0x24,0x04,0x03] +; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm1, (%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x0c,0x24,0x03] +; FMACALL32_BDVER2-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc4,0xe3,0x79,0x17,0x44,0x24,0x04,0x03] ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Reload ; FMACALL32_BDVER2-NEXT: ## encoding: 
[0xdb,0xac,0x24,0xd0,0x00,0x00,0x00]
@@ -1575,21 +1575,21 @@
 ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm2, %xmm3 ## encoding: [0xc4,0xe3,0x7d,0x19,0xd3,0x01]
 ; FMACALL32_BDVER2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x94,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00]
 ; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xca,0x01]
-; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01]
 ; FMACALL32_BDVER2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x8c,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
-; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x44,0x24,0x60]
-; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0xc2]
-; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],xmm2[0]
+; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
+; FMACALL32_BDVER2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x11,0x84,0x24,0x80,0x00,0x00,0x00]
+; FMACALL32_BDVER2-NEXT: vmovlhps %xmm2, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc2]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm1[0],xmm2[0]
 ; FMACALL32_BDVER2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x5c,0x24,0x70]
 ; FMACALL32_BDVER2-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x5c,0x24,0x10]
 ; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
 ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x54,0x24,0x50]
+; FMACALL32_BDVER2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
+; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x4c,0x24,0x60]
 ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
 ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
diff --git a/llvm/test/CodeGen/X86/pr44140.ll b/llvm/test/CodeGen/X86/pr44140.ll
--- a/llvm/test/CodeGen/X86/pr44140.ll
+++ b/llvm/test/CodeGen/X86/pr44140.ll
@@ -22,20 +22,20 @@
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1
-; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm7
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm2
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm3
 ; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1
+; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0
 ; CHECK-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0
+; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm5
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm4
 ; CHECK-NEXT: vmovups %ymm5, {{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -1119,26 +1119,26 @@
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2
 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; BDVER2-NEXT: vrcpps %ymm1, %ymm4
 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
-; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
-; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm4 * ymm1) + ymm4
+; BDVER2-NEXT: vrcpps %ymm1, %ymm2
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2
 ; BDVER2-NEXT: retq
 ;
 ; BTVER2-LABEL: v16f32_one_step:
 ; BTVER2: # %bb.0:
 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; BTVER2-NEXT: vrcpps %ymm0, %ymm2
-; BTVER2-NEXT: vrcpps %ymm1, %ymm4
 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0
-; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1
 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0
-; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
 ; BTVER2-NEXT: vmulps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT: vmulps %ymm1, %ymm4, %ymm1
 ; BTVER2-NEXT: vaddps %ymm0, %ymm2, %ymm0
-; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1
+; BTVER2-NEXT: vrcpps %ymm1, %ymm2
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
+; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
+; BTVER2-NEXT: vaddps %ymm1, %ymm2, %ymm1
 ; BTVER2-NEXT: retq
 ;
 ; SANDY-LABEL: v16f32_one_step:
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -279,10 +279,10 @@
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BDVER2-NEXT: vfmsubss {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
-; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
-; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
-; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
+; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
+; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
 ; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
 ; BDVER2-NEXT: retq
 ;
@@ -290,14 +290,14 @@
 ; BTVER2: # %bb.0:
 ; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2
 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2
 ; BTVER2-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
+; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0
-; BTVER2-NEXT: vsubss %xmm0, %xmm4, %xmm0
+; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
 ; BTVER2-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; BTVER2-NEXT: vaddss %xmm0, %xmm3, %xmm0
 ; BTVER2-NEXT: retq
@@ -623,10 +623,10 @@
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpps %xmm0, %xmm1
 ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
-; BDVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
-; BDVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3
-; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
+; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; BDVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3
+; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
 ; BDVER2-NEXT: retq
 ;
@@ -634,14 +634,14 @@
 ; BTVER2: # %bb.0:
 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; BTVER2-NEXT: vrcpps %xmm0, %xmm1
-; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2
 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2
 ; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm2
 ; BTVER2-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; BTVER2-NEXT: vmulps %xmm4, %xmm1, %xmm3
+; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
+; BTVER2-NEXT: vmulps %xmm2, %xmm1, %xmm3
 ; BTVER2-NEXT: vmulps %xmm3, %xmm0, %xmm0
-; BTVER2-NEXT: vsubps %xmm0, %xmm4, %xmm0
+; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0
 ; BTVER2-NEXT: vmulps %xmm0, %xmm1, %xmm0
 ; BTVER2-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; BTVER2-NEXT: retq
@@ -1000,10 +1000,10 @@
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpps %ymm0, %ymm1
 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm2 = (ymm0 * ymm1) - mem
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm1
-; BDVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3
-; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm4
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BDVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm2
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
 ; BDVER2-NEXT: retq
 ;
@@ -1011,14 +1011,14 @@
 ; BTVER2: # %bb.0:
 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; BTVER2-NEXT: vrcpps %ymm0, %ymm1
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2
 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2
 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm2
 ; BTVER2-NEXT: vaddps %ymm2, %ymm1, %ymm1
-; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm3
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
+; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
 ; BTVER2-NEXT: vmulps %ymm3, %ymm0, %ymm0
-; BTVER2-NEXT: vsubps %ymm0, %ymm4, %ymm0
+; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0
 ; BTVER2-NEXT: vmulps %ymm0, %ymm1, %ymm0
 ; BTVER2-NEXT: vaddps %ymm0, %ymm3, %ymm0
 ; BTVER2-NEXT: retq
@@ -1192,14 +1192,14 @@
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpps %ymm0, %ymm2
 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
-; BDVER2-NEXT: vrcpps %ymm1, %ymm5
 ; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm3
 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
-; BDVER2-NEXT: vmulps %ymm3, %ymm5, %ymm4
+; BDVER2-NEXT: vrcpps %ymm1, %ymm2
+; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
-; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm5 * ymm1) + ymm4
+; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
 ; BDVER2-NEXT: retq
 ;
 ; BTVER2-LABEL: v16f32_one_step2:
@@ -1580,11 +1580,11 @@
 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm4
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm5
 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2
-; BDVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm3) + ymm2
-; BDVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
-; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm5
+; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; BDVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
+; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
 ; BDVER2-NEXT: retq
 ;
@@ -1604,13 +1604,13 @@
 ; BTVER2-NEXT: vrcpps %ymm1, %ymm2
 ; BTVER2-NEXT: vmulps %ymm2, %ymm1, %ymm3
 ; BTVER2-NEXT: vaddps %ymm0, %ymm5, %ymm0
-; BTVER2-NEXT: vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3
 ; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm3
 ; BTVER2-NEXT: vaddps %ymm3, %ymm2, %ymm2
-; BTVER2-NEXT: vmulps %ymm5, %ymm2, %ymm4
+; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
+; BTVER2-NEXT: vmulps %ymm3, %ymm2, %ymm4
 ; BTVER2-NEXT: vmulps %ymm4, %ymm1, %ymm1
-; BTVER2-NEXT: vsubps %ymm1, %ymm5, %ymm1
+; BTVER2-NEXT: vsubps %ymm1, %ymm3, %ymm1
 ; BTVER2-NEXT: vmulps %ymm1, %ymm2, %ymm1
 ; BTVER2-NEXT: vaddps %ymm1, %ymm4, %ymm1
 ; BTVER2-NEXT: retq
diff --git a/llvm/utils/TableGen/X86RecognizableInstr.cpp b/llvm/utils/TableGen/X86RecognizableInstr.cpp
--- a/llvm/utils/TableGen/X86RecognizableInstr.cpp
+++ b/llvm/utils/TableGen/X86RecognizableInstr.cpp
@@ -922,6 +922,8 @@
   TYPE("VR256", TYPE_YMM)
   TYPE("VR256X", TYPE_YMM)
   TYPE("VR512", TYPE_ZMM)
+  TYPE("VR128XQuad", TYPE_XMM_QUAD)
+  TYPE("VR512Quad", TYPE_ZMM_QUAD)
   TYPE("VK1", TYPE_VK)
   TYPE("VK1WM", TYPE_VK)
   TYPE("VK2", TYPE_VK)
@@ -1097,6 +1099,8 @@
   ENCODING("VR128X", ENCODING_VVVV)
   ENCODING("VR256X", ENCODING_VVVV)
   ENCODING("VR512", ENCODING_VVVV)
+  ENCODING("VR128XQuad", ENCODING_VVVV)
+  ENCODING("VR512Quad", ENCODING_VVVV)
   ENCODING("VK1", ENCODING_VVVV)
   ENCODING("VK2", ENCODING_VVVV)
   ENCODING("VK4", ENCODING_VVVV)