This is an archive of the discontinued LLVM Phabricator instance.

AVX-512 ERI Intrinsics for scalar instructions
Needs ReviewPublic

Authored by delena on Nov 23 2014, 6:13 AM.

Download Raw Diff

Details

Reviewers

anemet
rob.khasanov

Summary

Added full coverage for scalar ERI intrinsics, including SAE mode and memory operands.
Added the AVX512_maskable_scalar template, which should cover all scalar instructions in the future.

The main difference between AVX512_maskable_scalar<> and AVX512_maskable<> is that the former uses X86select instead of vselect.
This is needed because a vselect node cannot be created with an MVT::i1 mask for a scalar instruction.

Diff Detail

Event Timeline

delena updated this revision to Diff 16538.Nov 23 2014, 6:13 AM

delena retitled this revision from to AVX-512 ERI Intrinsics for scalar instructions.

delena updated this object.

delena edited the test plan for this revision. (Show Details)

delena added reviewers: anemet, rob.khasanov.

delena set the repository for this revision to rL LLVM.

delena added a subscriber: Unknown Object (MLST).

Sorry about the delay responding, I was on vacation. I see you already committed this. I think this looks good. I just have a few comments that would be good to fix.

lib/Target/X86/X86ISelLowering.cpp
16802–16805	Please add a comment, especially about the difference from getVectorMaskingNode that you explained in the patch comment.
lib/Target/X86/X86InstrAVX512.td
26–31	I think that this needs a comment and a better name. This is more like the number of elements in the RC, not in the VT, right? You should probably also add a comment before X86VectorVTInfo explaining that scalar types in vector registers are essentially treated as occupying the entire 128-bit vector register with the appropriate number of upper elements ignored (probably with some examples).
124–126	Why not FR32X and FR64X for RC?

Hi Adam, I'm sorry for the delay. I'm going to upload a new diff with all comments addressed.

lib/Target/X86/X86InstrAVX512.td
124–126	I can't just change to FR32X, I have compilation errors in this case: VRSQRT28SSrkz: (set FR32X:<empty>:$dst, (X86select:<empty> VK1WM:i1:$mask, (X86rsqrt28s:v4f32 FR32X:<empty>:$src1, FR32X:v4f32:$src2, (imm:i32)<<P:Predicate_FROUND_CURRENT>>), (bitconvert:v4f32 (build_vector:v4i32)<<P:Predicate_immAllZerosV>>))) Included from lib/Target/X86/X86.td:432: Included from lib/Target/X86/X86InstrInfo.td:2387: lib/Target/X86/X86InstrAVX512.td:4253:3: error: In VRSQRT28SSrkz: Type inference contradiction found, merging 'f32' into 'v4f32' defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; In this case I should rewrite the full conception of FP scalars, used also in AVX and AVX2. Not sure that it is what we need now.

anemet added inline comments.Dec 8 2014, 10:13 AM

lib/Target/X86/X86InstrAVX512.td
124–126	OK. I made this comment when I still thought that these _info objects were wrapping the scalar types. A better way to think about them is as the "closest" vector type so that masking can be applied.

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

28 lines

X86InstrAVX512.td

131 lines

X86InstrFragmentsSIMD.td

7 lines

X86IntrinsicsInfo.h

6 lines

test/

CodeGen/

X86/

avx512er-intrinsics.ll

41 lines

Diff 16538

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 16,793 Lines • ▼ Show 20 Lines	switch (Op.getOpcode()) {
case X86ISD::CMPMU:		case X86ISD::CMPMU:
return DAG.getNode(ISD::AND, dl, VT, Op, VMask);		return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
}		}
if (PreservedSrc.getOpcode() == ISD::UNDEF)		if (PreservedSrc.getOpcode() == ISD::UNDEF)
PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);		PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);		return DAG.getNode(ISD::VSELECT, dl, VT, VMask, Op, PreservedSrc);
}		}

		static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
		SDValue PreservedSrc,
		const X86Subtarget *Subtarget,
		SelectionDAG &DAG) {
		anemetUnsubmitted Not Done Reply Inline Actions Please add comment, especially the difference from getVectorMaskingNode that you explained in the patch comment. anemet: Please add comment, especially the difference from getVectorMaskingNode that you explained in…
		EVT VT = Op.getValueType();
		SDLoc dl(Op);
		if (isAllOnes(Mask))
		return Op;

		// The mask should be of type MVT::i1
		SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);

		if (PreservedSrc.getOpcode() == ISD::UNDEF)
		PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
		return DAG.getNode(X86ISD::SELECT, dl, VT, IMask, Op, PreservedSrc);
		}

static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {		static unsigned getOpcodeForFMAIntrinsic(unsigned IntNo) {
switch (IntNo) {		switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.		default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
case Intrinsic::x86_fma_vfmadd_ps:		case Intrinsic::x86_fma_vfmadd_ps:
case Intrinsic::x86_fma_vfmadd_pd:		case Intrinsic::x86_fma_vfmadd_pd:
case Intrinsic::x86_fma_vfmadd_ps_256:		case Intrinsic::x86_fma_vfmadd_ps_256:
case Intrinsic::x86_fma_vfmadd_pd_256:		case Intrinsic::x86_fma_vfmadd_pd_256:
case Intrinsic::x86_fma_mask_vfmadd_ps_512:		case Intrinsic::x86_fma_mask_vfmadd_ps_512:
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	case INTR_TYPE_1OP_MASK_RM: {
SDValue Src = Op.getOperand(1);		SDValue Src = Op.getOperand(1);
SDValue Src0 = Op.getOperand(2);		SDValue Src0 = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);		SDValue Mask = Op.getOperand(3);
SDValue RoundingMode = Op.getOperand(4);		SDValue RoundingMode = Op.getOperand(4);
return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,		return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
RoundingMode),		RoundingMode),
Mask, Src0, Subtarget, DAG);		Mask, Src0, Subtarget, DAG);
}		}
		case INTR_TYPE_SCALAR_MASK_RM: {
		SDValue Src1 = Op.getOperand(1);
		SDValue Src2 = Op.getOperand(2);
		SDValue Src0 = Op.getOperand(3);
		SDValue Mask = Op.getOperand(4);
		SDValue RoundingMode = Op.getOperand(5);
		return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
		RoundingMode),
		Mask, Src0, Subtarget, DAG);
		}
case CMP_MASK:		case CMP_MASK:
case CMP_MASK_CC: {		case CMP_MASK_CC: {
// Comparison intrinsics with masks.		// Comparison intrinsics with masks.
// Example of transformation:		// Example of transformation:
// (i8 (int_x86_avx512_mask_pcmpeq_q_128		// (i8 (int_x86_avx512_mask_pcmpeq_q_128
// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->		// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
// (i8 (bitcast		// (i8 (bitcast
// (v8i1 (insert_subvector undef,		// (v8i1 (insert_subvector undef,
▲ Show 20 Lines • Show All 9,493 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrAVX512.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show All 17 Lines	class X86VectorVTInfo<int numelts, ValueType EltVT, RegisterClass rc,
// !lt in tablegen.		// !lt in tablegen.
RegisterClass MRC =		RegisterClass MRC =
!cast<RegisterClass>("GR" #		!cast<RegisterClass>("GR" #
!if (!eq (!srl(NumElts, 3), 0), 8, NumElts));		!if (!eq (!srl(NumElts, 3), 0), 8, NumElts));

// Suffix used in the instruction mnemonic.		// Suffix used in the instruction mnemonic.
string Suffix = suffix;		string Suffix = suffix;

string VTName = "v" # NumElts # EltVT;		int NumEltsInVT = !if (!eq (NumElts, 1),
		!if (!eq (EltVT.Size, 32), 4,
		!if (!eq (EltVT.Size, 64), 2, NumElts)), NumElts);

		string VTName = "v" # NumEltsInVT # EltVT;

		anemetUnsubmitted Not Done Reply Inline Actions I think that this needs a comment and a better name. This is more like the number of elements in the RC not in the VT, right? You should probably also add a comment before X86VTVectorInfo that for scalar types in vector registers they are essentially treated as occupying the entire 128-bit vector register with the appropriate number of upper elements ignored (probably with some examples). anemet: I think that this needs a comment and a better name. This is more like the number of elements…
// The vector VT.		// The vector VT.
ValueType VT = !cast<ValueType>(VTName);		ValueType VT = !cast<ValueType>(VTName);

string EltTypeName = !cast<string>(EltVT);		string EltTypeName = !cast<string>(EltVT);
// Size of the element type in bits, e.g. 32 for v16i32.		// Size of the element type in bits, e.g. 32 for v16i32.
string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));		string EltSizeName = !subst("i", "", !subst("f", "", EltTypeName));
int EltSize = EltVT.Size;		int EltSize = EltVT.Size;

Show All 16 Lines	PatFrag LdFrag = !cast<PatFrag>("load" #
!if (!eq (Size, 256), "v4i64",		!if (!eq (Size, 256), "v4i64",
VTName)), VTName));		VTName)), VTName));
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);		PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);

// Load patterns used for memory operands. We only have this defined in		// Load patterns used for memory operands. We only have this defined in
// case of i64 element types for sub-512 integer vectors. For now, keep		// case of i64 element types for sub-512 integer vectors. For now, keep
// MemOpFrag undefined in these cases.		// MemOpFrag undefined in these cases.
PatFrag MemOpFrag =		PatFrag MemOpFrag =
		!if (!eq (NumElts#EltTypeName, "1f32"), !cast<PatFrag>("memopfsf32"),
		!if (!eq (NumElts#EltTypeName, "1f64"), !cast<PatFrag>("memopfsf64"),
!if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName),		!if (!eq (TypeVariantName, "f"), !cast<PatFrag>("memop" # VTName),
!if (!eq (EltTypeName, "i64"), !cast<PatFrag>("memop" # VTName),		!if (!eq (EltTypeName, "i64"), !cast<PatFrag>("memop" # VTName),
!if (!eq (VTName, "v16i32"), !cast<PatFrag>("memop" # VTName), ?)));		!if (!eq (VTName, "v16i32"), !cast<PatFrag>("memop" # VTName), ?)))));

// The corresponding float type, e.g. v16f32 for v16i32		// The corresponding float type, e.g. v16f32 for v16i32
// Note: For EltSize < 32, FloatVT is illegal and TableGen		// Note: For EltSize < 32, FloatVT is illegal and TableGen
// fails to compile, so we choose FloatVT = VT		// fails to compile, so we choose FloatVT = VT
ValueType FloatVT = !cast<ValueType>(		ValueType FloatVT = !cast<ValueType>(
!if (!eq (!srl(EltSize,5),0),		!if (!eq (!srl(EltSize,5),0),
VTName,		VTName,
!if (!eq(TypeVariantName, "i"),		!if (!eq(TypeVariantName, "i"),
Show All 38 Lines

def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">;		def v16i8x_info : X86VectorVTInfo<16, i8, VR128X, "b">;
def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;		def v8i16x_info : X86VectorVTInfo<8, i16, VR128X, "w">;
def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;		def v4i32x_info : X86VectorVTInfo<4, i32, VR128X, "d">;
def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;		def v2i64x_info : X86VectorVTInfo<2, i64, VR128X, "q">;
def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;		def v4f32x_info : X86VectorVTInfo<4, f32, VR128X, "ps">;
def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;		def v2f64x_info : X86VectorVTInfo<2, f64, VR128X, "pd">;

		// the scalar staff
		def f32x_info : X86VectorVTInfo<1, f32, VR128X, "ss">;
		def f64x_info : X86VectorVTInfo<1, f64, VR128X, "sd">;

		anemetUnsubmitted Not Done Reply Inline Actions Why not FR32X and FR64X for RC? anemet: Why not FR32X and FR64X for RC?
		delenaAuthorUnsubmitted Not Done Reply Inline Actions I can't just change to FR32X, I have compilation errors in this case: VRSQRT28SSrkz: (set FR32X:<empty>:$dst, (X86select:<empty> VK1WM:i1:$mask, (X86rsqrt28s:v4f32 FR32X:<empty>:$src1, FR32X:v4f32:$src2, (imm:i32)<<P:Predicate_FROUND_CURRENT>>), (bitconvert:v4f32 (build_vector:v4i32)<<P:Predicate_immAllZerosV>>))) Included from lib/Target/X86/X86.td:432: Included from lib/Target/X86/X86InstrInfo.td:2387: lib/Target/X86/X86InstrAVX512.td:4253:3: error: In VRSQRT28SSrkz: Type inference contradiction found, merging 'f32' into 'v4f32' defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; In this case I should rewrite the full conception of FP scalars, used also in AVX and AVX2. Not sure that it is what we need now. delena: I can't just change to FR32X, I have compilation errors in this case: VRSQRT28SSrkz: (set…
		anemetUnsubmitted Not Done Reply Inline Actions OK. I made this comment when I still thought that these _info object were wrapping the scalar types. A better way to think about them is the "closest" vector type so that masking can be applied. anemet: OK. I made this comment when I still thought that these _info object were wrapping the scalar…
class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,		class AVX512VLVectorVTInfo<X86VectorVTInfo i512, X86VectorVTInfo i256,
X86VectorVTInfo i128> {		X86VectorVTInfo i128> {
X86VectorVTInfo info512 = i512;		X86VectorVTInfo info512 = i512;
X86VectorVTInfo info256 = i256;		X86VectorVTInfo info256 = i256;
X86VectorVTInfo info128 = i128;		X86VectorVTInfo info128 = i128;
}		}

def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,		def avx512vl_i8_info : AVX512VLVectorVTInfo<v64i8_info, v32i8x_info,
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines

// Common base class of AVX512_maskable and AVX512_maskable_3src.		// Common base class of AVX512_maskable and AVX512_maskable_3src.
multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,		multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs,		dag Outs,
dag Ins, dag MaskingIns, dag ZeroMaskingIns,		dag Ins, dag MaskingIns, dag ZeroMaskingIns,
string OpcodeStr,		string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,		string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,		dag RHS, dag MaskingRHS,
string Round = "",		SDNode Select = vselect, string Round = "",
string MaskingConstraint = "",		string MaskingConstraint = "",
InstrItinClass itin = NoItinerary,		InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :		bit IsCommutable = 0> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,		AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,		AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],		[(set _.RC:$dst, RHS)],
[(set _.RC:$dst, MaskingRHS)],		[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,		[(set _.RC:$dst,
(vselect _.KRCWM:$mask, RHS, _.ImmAllZerosV))],		(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
Round, MaskingConstraint, NoItinerary, IsCommutable>;		Round, MaskingConstraint, NoItinerary, IsCommutable>;

// This multiclass generates the unconditional/non-masking, the masking and		// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the instruction. In the masking case, the		// the zero-masking variant of the vector instruction. In the masking case, the
// perserved vector elements come from a new dummy input operand tied to $dst.		// perserved vector elements come from a new dummy input operand tied to $dst.
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,		multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,		dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,		string AttSrcAsm, string IntelSrcAsm,
dag RHS, string Round = "",		dag RHS, string Round = "",
InstrItinClass itin = NoItinerary,		InstrItinClass itin = NoItinerary,
bit IsCommutable = 0> :		bit IsCommutable = 0> :
AVX512_maskable_common<O, F, _, Outs, Ins,		AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),		!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),		!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,		OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(vselect _.KRCWM:$mask, RHS, _.RC:$src0), Round,		(vselect _.KRCWM:$mask, RHS, _.RC:$src0), vselect,
"$src0 = $dst", itin, IsCommutable>;		Round, "$src0 = $dst", itin, IsCommutable>;

		// This multiclass generates the unconditional/non-masking, the masking and
		// the zero-masking variant of the scalar instruction.
		multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
		dag Outs, dag Ins, string OpcodeStr,
		string AttSrcAsm, string IntelSrcAsm,
		dag RHS, string Round = "",
		InstrItinClass itin = NoItinerary,
		bit IsCommutable = 0> :
		AVX512_maskable_common<O, F, _, Outs, Ins,
		!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
		!con((ins _.KRCWM:$mask), Ins),
		OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
		(X86select _.KRCWM:$mask, RHS, _.RC:$src0), X86select,
		Round, "$src0 = $dst", itin, IsCommutable>;

// Similar to AVX512_maskable but in this case one of the source operands		// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved		// ($src1) is already tied to $dst so we just use that for the preserved
// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude		// vector elements. NOTE that the NonTiedIns (the ins dag) should exclude
// $src1.		// $src1.
multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,		multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,		dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,		string AttSrcAsm, string IntelSrcAsm,
▲ Show 20 Lines • Show All 3,979 Lines • ▼ Show 20 Lines
def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src),		def : Pat <(v16f32 (int_x86_avx512_rcp14_ps_512 (v16f32 VR512:$src),
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),		(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
(VRCP14PSZr VR512:$src)>;		(VRCP14PSZr VR512:$src)>;
def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src),		def : Pat <(v8f64 (int_x86_avx512_rcp14_pd_512 (v8f64 VR512:$src),
(bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),		(bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
(VRCP14PDZr VR512:$src)>;		(VRCP14PDZr VR512:$src)>;

/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd		/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr, RegisterClass RC,		multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
X86MemOperand x86memop> {		SDNode OpNode> {
let hasSideEffects = 0, Predicates = [HasERI] in {
def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr,
" \t{$src2, $src1, $dst\|$dst, $src1, $src2}"), []>, EVEX_4V;
def rrb : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr,
" \t{{sae}, $src2, $src1, $dst\|$dst, $src1, $src2, {sae}}"),
[]>, EVEX_4V, EVEX_B;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr,
" \t{$src2, $src1, $dst\|$dst, $src1, $src2}"), []>, EVEX_4V;
}
}
}

defm VRCP28SS : avx512_fp28_s<0xCB, "vrcp28ss", FR32X, f32mem>,
EVEX_CD8<32, CD8VT1>;
defm VRCP28SD : avx512_fp28_s<0xCB, "vrcp28sd", FR64X, f64mem>,
VEX_W, EVEX_CD8<64, CD8VT1>;
defm VRSQRT28SS : avx512_fp28_s<0xCD, "vrsqrt28ss", FR32X, f32mem>,
EVEX_CD8<32, CD8VT1>;
defm VRSQRT28SD : avx512_fp28_s<0xCD, "vrsqrt28sd", FR64X, f64mem>,
VEX_W, EVEX_CD8<64, CD8VT1>;

def : Pat <(v4f32 (int_x86_avx512_rcp28_ss (v4f32 VR128X:$src1),		defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),		(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
FROUND_NO_EXC)),		"$src2, $src1", "$src1, $src2",
(COPY_TO_REGCLASS (VRCP28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),		(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;		(i32 FROUND_CURRENT))>;

def : Pat <(v2f64 (int_x86_avx512_rcp28_sd (v2f64 VR128X:$src1),		defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),		(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
FROUND_NO_EXC)),		"$src2, $src1", "$src1, $src2",
(COPY_TO_REGCLASS (VRCP28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),		(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;		(i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;

def : Pat <(v4f32 (int_x86_avx512_rsqrt28_ss (v4f32 VR128X:$src1),		defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(v4f32 VR128X:$src2), (bc_v4f32 (v4i32 immAllZerosV)), (i8 -1),		(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
FROUND_NO_EXC)),		"$src2, $src1", "$src1, $src2",
(COPY_TO_REGCLASS (VRSQRT28SSrrb (COPY_TO_REGCLASS VR128X:$src1, FR32X),		(OpNode (_.VT _.RC:$src1),
(COPY_TO_REGCLASS VR128X:$src2, FR32X)), VR128X)>;		(_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
		(i32 FROUND_CURRENT))>;
		}

def : Pat <(v2f64 (int_x86_avx512_rsqrt28_sd (v2f64 VR128X:$src1),		multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
(v2f64 VR128X:$src2), (bc_v2f64 (v4i32 immAllZerosV)), (i8 -1),		defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>,
FROUND_NO_EXC)),		EVEX_CD8<32, CD8VT1>;
(COPY_TO_REGCLASS (VRSQRT28SDrrb (COPY_TO_REGCLASS VR128X:$src1, FR64X),		defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>,
(COPY_TO_REGCLASS VR128X:$src2, FR64X)), VR128X)>;		EVEX_CD8<64, CD8VT1>, VEX_W;
		}

		let hasSideEffects = 0, Predicates = [HasERI] in {
		defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V;
		defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
		}
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd		/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd

multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,		multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode> {		SDNode OpNode> {

defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),		defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",		(ins _.RC:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;		(OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;

defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),		defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,		(ins _.RC:$src), OpcodeStr,
"$src", "$src",		"$src", "$src",
(OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), "{sae}">, EVEX_B;		(OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)),
		"{sae}">, EVEX_B;

defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),		defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",		(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.FloatVT		(OpNode (_.FloatVT
(bitconvert (_.LdFrag addr:$src))), (i32 FROUND_CURRENT))>;		(bitconvert (_.LdFrag addr:$src))),
		(i32 FROUND_CURRENT))>;

defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),		defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",		(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.FloatVT		(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),		(X86VBroadcast (_.ScalarLdFrag addr:$src))),
(i32 FROUND_CURRENT))>, EVEX_B;		(i32 FROUND_CURRENT))>, EVEX_B;
}		}

▲ Show 20 Lines • Show All 952 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrFragmentsSIMD.td

	Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines

	def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,			def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;			SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;

	def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,			def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
	SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;			SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
	def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,			def STDFp1SrcRm : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>,
	SDTCisVec<0>, SDTCisInt<2>]>;			SDTCisVec<0>, SDTCisInt<2>]>;
				def STDFp2SrcRm : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
				SDTCisVec<0>, SDTCisInt<3>]>;

	def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;			def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
	def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;			def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;

	def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;			def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
	def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;			def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
	def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;			def X86PShuflw : SDNode<"X86ISD::PSHUFLW", SDTShuff2OpI>;

	▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines
	def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;			def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
	def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;			def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
	def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;			def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
	def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;			def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
	def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;			def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;

	def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>;			def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", STDFp1SrcRm>;
	def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>;			def X86rcp28 : SDNode<"X86ISD::RCP28", STDFp1SrcRm>;
	def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>;			def X86exp2 : SDNode<"X86ISD::EXP2", STDFp1SrcRm>;

				def X86rsqrt28s : SDNode<"X86ISD::RSQRT28", STDFp2SrcRm>;
				def X86rcp28s : SDNode<"X86ISD::RCP28", STDFp2SrcRm>;

	def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,			def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
	SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,			SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
	SDTCisVT<4, i8>]>;			SDTCisVT<4, i8>]>;
	def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,			def SDT_PCMPESTRI : SDTypeProfile<2, 5, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
	SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,			SDTCisVT<2, v16i8>, SDTCisVT<3, i32>,
	SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,			SDTCisVT<4, v16i8>, SDTCisVT<5, i32>,
	SDTCisVT<6, i8>]>;			SDTCisVT<6, i8>]>;

	▲ Show 20 Lines • Show All 310 Lines • Show Last 20 Lines

lib/Target/X86/X86IntrinsicsInfo.h

Show All 15 Lines

namespace llvm {		namespace llvm {

enum IntrinsicType {		enum IntrinsicType {
INTR_NO_TYPE,		INTR_NO_TYPE,
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,		GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,		INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,		CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
INTR_TYPE_1OP_MASK_RM		INTR_TYPE_1OP_MASK_RM, INTR_TYPE_SCALAR_MASK_RM
};		};

struct IntrinsicData {		struct IntrinsicData {

unsigned Id;		unsigned Id;
IntrinsicType Type;		IntrinsicType Type;
unsigned Opc0;		unsigned Opc0;
unsigned Opc1;		unsigned Opc1;
▲ Show 20 Lines • Show All 177 Lines • ▼ Show 20 Lines	static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),		X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),		X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),		X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),		X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),		X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),		X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),		X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),		X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RCP28, 0),
		X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0),
		X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),		X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),		X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
		X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
		X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),		X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),		X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),		X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),		X86_INTRINSIC_DATA(avx_hsub_ps_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),		X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),		X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),		X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),		X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
▲ Show 20 Lines • Show All 93 Lines • Show Last 20 Lines

test/CodeGen/X86/avx512er-intrinsics.ll

	Show First 20 Lines • Show All 58 Lines • ▼ Show 20 Lines
	define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {			define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
	; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]			; CHECK: vexp2pd %zmm0, %zmm0 {sae} # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
	%res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)			%res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
	ret <8 x double> %res			ret <8 x double> %res
	}			}
	declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone			declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone

	define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {			define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
	; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]			; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
	%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]			%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone			declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

	define <4 x float> @test_rcp28_ss(<4 x float> %a0) {			define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
	; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]			; CHECK: vrcp28ss %xmm0, %xmm0, %xmm0 {sae} # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
	%res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]			%res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone			declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone

				define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
				; CHECK: vrsqrt28ss %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
				%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
				ret <4 x float> %res
				}

				define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
				; CHECK: vrsqrt28ss %xmm1, %xmm0, %xmm2 {%k1}{sae} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
				%res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
				ret <4 x float> %res
				}

				define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
				; CHECK: vrsqrt28sd %xmm0, %xmm0, %xmm0 {%k1} {z}{sae} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
				%res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
				ret <2 x double> %res
				}

				declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone

				define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) {
				; CHECK: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
				%mem = load double * %ptr, align 8
				%mem_v = insertelement <2 x double> undef, double %mem, i32 0
				%res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
				ret <2 x double> %res
				}

				define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) {
				; CHECK: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
				%ptr1 = getelementptr double* %ptr, i32 18
				%mem = load double * %ptr1, align 8
				%mem_v = insertelement <2 x double> undef, double %mem, i32 0
				%res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
				ret <2 x double> %res
				}