This is an archive of the discontinued LLVM Phabricator instance.

[LLVM][X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation
ClosedPublic

Authored by RKSimon on Apr 6 2017, 9:04 AM.

Download Raw Diff

Details

Reviewers

spatel
delena
andreadb
craig.topper
m_zuckerman

Commits

rG5a22eaa2bf4e: [X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation (LLVM)
rL300325: [X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation (LLVM)

Summary

MOVNTDQA non-temporal aligned vector loads can be correctly represented using generic builtin loads, allowing us to remove the existing x86 intrinsics.

The Clang companion patch can be found at D31766.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. Apr 6 2017, 9:04 AM

craig.topper added inline comments. Apr 7 2017, 9:53 PM

lib/IR/AutoUpgrade.cpp
1878 ↗	(On Diff #94380)	I know we already do this in several places in this file, but why do we need a SmallVector of 1? Won't a single Metadata * variable auto convert to ArrayRef for the MDNode::get call?
1884 ↗	(On Diff #94380)	Do we need a VectorType for the two calls that use this or could it just be generic Type*?

RKSimon added inline comments. Apr 10 2017, 10:59 AM

lib/IR/AutoUpgrade.cpp
1878 ↗	(On Diff #94380)	OK - I'll fix it in this one and fix the others as followups.
1884 ↗	(On Diff #94380)	IIRC we need the VectorType for the getBitWidth() call.

craig.topper added inline comments. Apr 10 2017, 11:14 AM

lib/IR/AutoUpgrade.cpp
1884 ↗	(On Diff #94380)	Right, forgot getBitWidth has to multiply the elements with the scalar width.

Updated based on Craig's feedback

LGTM.

Did we really not have tests for the avx512 and sse41 versions?

This revision is now accepted and ready to land. Apr 11 2017, 9:29 AM

In D31767#723836, @craig.topper wrote:

> LGTM.
>
> Did we really not have tests for the avx512 and sse41 versions?

Nope and I'm not sure why not.

Closed by commit rL300325: [X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation (LLVM) (authored by RKSimon). · Explain Why · Apr 14 2017, 8:18 AM

This revision was automatically updated to reflect the committed changes.

RKSimon mentioned this in rL300326: [X86][SSE] Update MOVNTDQA non-temporal loads to generic implementation (clang).

Revision Contents

Path

Size

llvm/

trunk/

include/

llvm/

IR/

IntrinsicsX86.td

12 lines

lib/

IR/

AutoUpgrade.cpp

17 lines

Target/

X86/

X86InstrAVX512.td

9 lines

X86InstrSSE.td

9 lines

test/

CodeGen/

X86/

avx2-intrinsics-x86-upgrade.ll

14 lines

avx2-intrinsics-x86.ll

46 lines

avx512-intrinsics-upgrade.ll

11 lines

sse41-intrinsics-x86-upgrade.ll

13 lines

Diff 95302

llvm/trunk/include/llvm/IR/IntrinsicsX86.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 779 Lines • ▼ Show 20 Lines

// Vector sum of absolute differences		// Vector sum of absolute differences
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".		let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">,		def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">,
Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty],		Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty],
[IntrNoMem, Commutative]>;		[IntrNoMem, Commutative]>;
}		}

// Cacheability support ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse41_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa">,
Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
}

// Test instruction with bitwise comparison.		// Test instruction with bitwise comparison.
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".		let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse41_ptestz : GCCBuiltin<"__builtin_ia32_ptestz128">,		def int_x86_sse41_ptestz : GCCBuiltin<"__builtin_ia32_ptestz128">,
Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],		Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;		[IntrNoMem]>;
def int_x86_sse41_ptestc : GCCBuiltin<"__builtin_ia32_ptestc128">,		def int_x86_sse41_ptestc : GCCBuiltin<"__builtin_ia32_ptestc128">,
Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],		Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty],
[IntrNoMem]>;		[IntrNoMem]>;
▲ Show 20 Lines • Show All 1,539 Lines • ▼ Show 20 Lines	let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,		def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;		Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">,		def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">,
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,		Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
llvm_v32i8_ty], [IntrNoMem]>;		llvm_v32i8_ty], [IntrNoMem]>;
def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,		def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,		Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
llvm_i8_ty], [IntrNoMem, Commutative]>;		llvm_i8_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">,
Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// FMA3 and FMA4		// FMA3 and FMA4

let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".		let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_fma_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,		def int_x86_fma_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
Intrinsic<[llvm_v4f32_ty],		Intrinsic<[llvm_v4f32_ty],
▲ Show 20 Lines • Show All 3,981 Lines • ▼ Show 20 Lines	let TargetPrefix = "x86" in {
def int_x86_avx512_mask_cmp_ss :		def int_x86_avx512_mask_cmp_ss :
GCCBuiltin<"__builtin_ia32_cmpss_mask">,		GCCBuiltin<"__builtin_ia32_cmpss_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,		Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;		llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_cmp_sd :		def int_x86_avx512_mask_cmp_sd :
GCCBuiltin<"__builtin_ia32_cmpsd_mask">,		GCCBuiltin<"__builtin_ia32_cmpsd_mask">,
Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,		Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;		llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;

def int_x86_avx512_movntdqa :
GCCBuiltin<"__builtin_ia32_movntdqa512">,
Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// SHA intrinsics		// SHA intrinsics
let TargetPrefix = "x86" in {		let TargetPrefix = "x86" in {
def int_x86_sha1rnds4 : GCCBuiltin<"__builtin_ia32_sha1rnds4">,		def int_x86_sha1rnds4 : GCCBuiltin<"__builtin_ia32_sha1rnds4">,
Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],		Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
[IntrNoMem]>;		[IntrNoMem]>;
Show All 32 Lines

llvm/trunk/lib/IR/AutoUpgrade.cpp

Show First 20 Lines • Show All 196 Lines • ▼ Show 20 Lines	if (Name.startswith("sse2.pcmpeq.") \|\| // Added in 3.1
Name == "avx2.vinserti128" \|\| // Added in 3.7		Name == "avx2.vinserti128" \|\| // Added in 3.7
Name.startswith("avx512.mask.insert") \|\| // Added in 4.0		Name.startswith("avx512.mask.insert") \|\| // Added in 4.0
Name.startswith("avx.vextractf128.") \|\| // Added in 3.7		Name.startswith("avx.vextractf128.") \|\| // Added in 3.7
Name == "avx2.vextracti128" \|\| // Added in 3.7		Name == "avx2.vextracti128" \|\| // Added in 3.7
Name.startswith("avx512.mask.vextract") \|\| // Added in 4.0		Name.startswith("avx512.mask.vextract") \|\| // Added in 4.0
Name.startswith("sse4a.movnt.") \|\| // Added in 3.9		Name.startswith("sse4a.movnt.") \|\| // Added in 3.9
Name.startswith("avx.movnt.") \|\| // Added in 3.2		Name.startswith("avx.movnt.") \|\| // Added in 3.2
Name.startswith("avx512.storent.") \|\| // Added in 3.9		Name.startswith("avx512.storent.") \|\| // Added in 3.9
		Name == "sse41.movntdqa" \|\| // Added in 5.0
		Name == "avx2.movntdqa" \|\| // Added in 5.0
		Name == "avx512.movntdqa" \|\| // Added in 5.0
Name == "sse2.storel.dq" \|\| // Added in 3.9		Name == "sse2.storel.dq" \|\| // Added in 3.9
Name.startswith("sse.storeu.") \|\| // Added in 3.9		Name.startswith("sse.storeu.") \|\| // Added in 3.9
Name.startswith("sse2.storeu.") \|\| // Added in 3.9		Name.startswith("sse2.storeu.") \|\| // Added in 3.9
Name.startswith("avx.storeu.") \|\| // Added in 3.9		Name.startswith("avx.storeu.") \|\| // Added in 3.9
Name.startswith("avx512.mask.storeu.") \|\| // Added in 3.9		Name.startswith("avx512.mask.storeu.") \|\| // Added in 3.9
Name.startswith("avx512.mask.store.p") \|\| // Added in 3.9		Name.startswith("avx512.mask.store.p") \|\| // Added in 3.9
Name.startswith("avx512.mask.store.b.") \|\| // Added in 3.9		Name.startswith("avx512.mask.store.b.") \|\| // Added in 3.9
Name.startswith("avx512.mask.store.w.") \|\| // Added in 3.9		Name.startswith("avx512.mask.store.w.") \|\| // Added in 3.9
▲ Show 20 Lines • Show All 1,657 Lines • ▼ Show 20 Lines	if (IsX86 && (Name.startswith("sse2.pcmp") \|\|
else		else
llvm_unreachable("Unexpected vpermilvar intrinsic");		llvm_unreachable("Unexpected vpermilvar intrinsic");

Function *Intrin = Intrinsic::getDeclaration(F->getParent(), IID);		Function *Intrin = Intrinsic::getDeclaration(F->getParent(), IID);
Rep = Builder.CreateCall(Intrin,		Rep = Builder.CreateCall(Intrin,
{ CI->getArgOperand(0), CI->getArgOperand(1) });		{ CI->getArgOperand(0), CI->getArgOperand(1) });
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,		Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));		CI->getArgOperand(2));
		} else if (IsX86 && Name.endswith(".movntdqa")) {
		Module *M = F->getParent();
		MDNode *Node = MDNode::get(
		C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));

		Value *Ptr = CI->getArgOperand(0);
		VectorType *VTy = cast<VectorType>(CI->getType());

		// Convert the type of the pointer to a pointer to the stored type.
		Value *BC =
		Builder.CreateBitCast(Ptr, PointerType::getUnqual(VTy), "cast");
		LoadInst *LI = Builder.CreateAlignedLoad(BC, VTy->getBitWidth() / 8);
		LI->setMetadata(M->getMDKindID("nontemporal"), Node);
		Rep = LI;
} else if (IsNVVM && (Name == "abs.i" \|\| Name == "abs.ll")) {		} else if (IsNVVM && (Name == "abs.i" \|\| Name == "abs.ll")) {
Value *Arg = CI->getArgOperand(0);		Value *Arg = CI->getArgOperand(0);
Value *Neg = Builder.CreateNeg(Arg, "neg");		Value *Neg = Builder.CreateNeg(Arg, "neg");
Value *Cmp = Builder.CreateICmpSGE(		Value *Cmp = Builder.CreateICmpSGE(
Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");		Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");		Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
} else if (IsNVVM && (Name == "max.i" \|\| Name == "max.ll" \|\|		} else if (IsNVVM && (Name == "max.i" \|\| Name == "max.ll" \|\|
Name == "max.ui" \|\| Name == "max.ull")) {		Name == "max.ui" \|\| Name == "max.ull")) {
▲ Show 20 Lines • Show All 379 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrAVX512.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 3,629 Lines • ▼ Show 20 Lines	def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;		(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
}		}
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals		// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
let SchedRW = [WriteLoad] in {		let SchedRW = [WriteLoad] in {
def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),		def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src), "vmovntdqa\t{$src, $dst\|$dst, $src}",		(ins i512mem:$src), "vmovntdqa\t{$src, $dst\|$dst, $src}",
[(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],		[], SSEPackedInt>, EVEX, T8PD, EVEX_V512,
SSEPackedInt>, EVEX, T8PD, EVEX_V512,
EVEX_CD8<64, CD8VF>;		EVEX_CD8<64, CD8VF>;

let Predicates = [HasVLX] in {		let Predicates = [HasVLX] in {
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),		def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
(ins i256mem:$src),		(ins i256mem:$src),
"vmovntdqa\t{$src, $dst\|$dst, $src}",		"vmovntdqa\t{$src, $dst\|$dst, $src}",
[(set VR256X:$dst, (int_x86_avx2_movntdqa addr:$src))],		[], SSEPackedInt>, EVEX, T8PD, EVEX_V256,
SSEPackedInt>, EVEX, T8PD, EVEX_V256,
EVEX_CD8<64, CD8VF>;		EVEX_CD8<64, CD8VF>;

def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),		def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),		(ins i128mem:$src),
"vmovntdqa\t{$src, $dst\|$dst, $src}",		"vmovntdqa\t{$src, $dst\|$dst, $src}",
[(set VR128X:$dst, (int_x86_sse41_movntdqa addr:$src))],		[], SSEPackedInt>, EVEX, T8PD, EVEX_V128,
SSEPackedInt>, EVEX, T8PD, EVEX_V128,
EVEX_CD8<64, CD8VF>;		EVEX_CD8<64, CD8VF>;
}		}
}		}

multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,		multiclass avx512_movnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
PatFrag st_frag = alignednontemporalstore,		PatFrag st_frag = alignednontemporalstore,
InstrItinClass itin = IIC_SSE_MOVNT> {		InstrItinClass itin = IIC_SSE_MOVNT> {
let SchedRW = [WriteStore], AddedComplexity = 400 in		let SchedRW = [WriteStore], AddedComplexity = 400 in
▲ Show 20 Lines • Show All 5,763 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,092 Lines • ▼ Show 20 Lines	def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
(v2f64 VR128:$src2))),		(v2f64 VR128:$src2))),
(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;		(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
}		}

let AddedComplexity = 400 in { // Prefer non-temporal versions		let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteLoad] in {		let SchedRW = [WriteLoad] in {
let Predicates = [HasAVX, NoVLX] in		let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),		def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovntdqa\t{$src, $dst\|$dst, $src}",		"vmovntdqa\t{$src, $dst\|$dst, $src}", []>,
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
VEX, VEX_WIG;		VEX, VEX_WIG;
let Predicates = [HasAVX2, NoVLX] in		let Predicates = [HasAVX2, NoVLX] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),		def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovntdqa\t{$src, $dst\|$dst, $src}",		"vmovntdqa\t{$src, $dst\|$dst, $src}", []>,
[(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
VEX, VEX_L, VEX_WIG;		VEX, VEX_L, VEX_WIG;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),		def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movntdqa\t{$src, $dst\|$dst, $src}",		"movntdqa\t{$src, $dst\|$dst, $src}", []>;
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
} // SchedRW		} // SchedRW

let Predicates = [HasAVX2, NoVLX] in {		let Predicates = [HasAVX2, NoVLX] in {
def : Pat<(v8f32 (alignednontemporalload addr:$src)),		def : Pat<(v8f32 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;		(VMOVNTDQAYrm addr:$src)>;
def : Pat<(v4f64 (alignednontemporalload addr:$src)),		def : Pat<(v4f64 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;		(VMOVNTDQAYrm addr:$src)>;
def : Pat<(v4i64 (alignednontemporalload addr:$src)),		def : Pat<(v4i64 (alignednontemporalload addr:$src)),
▲ Show 20 Lines • Show All 1,574 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll

	Show All 28 Lines
	; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]			; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]			%res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
	ret <8 x i32> %res			ret <8 x i32> %res
	}			}
	declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone			declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone


				define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
				; CHECK-LABEL: test_x86_avx2_movntdqa:
				; CHECK: ## BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: vmovntdqa (%eax), %ymm0
				; CHECK-NEXT: retl
				%res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
				ret <4 x i64> %res
				}
				declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly


	define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {			define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
	; CHECK-LABEL: test_x86_avx2_mpsadbw:			; CHECK-LABEL: test_x86_avx2_mpsadbw:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0			; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]			%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
	ret <16 x i16> %res			ret <16 x i16> %res
	}			}
	▲ Show 20 Lines • Show All 320 Lines • ▼ Show 20 Lines
	declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone			declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone

	; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions			; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
	define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {			define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
	; add operation forces the execution domain.			; add operation forces the execution domain.
	; CHECK-LABEL: test_x86_avx_storeu_dq_256:			; CHECK-LABEL: test_x86_avx_storeu_dq_256:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax			; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
	; CHECK-NEXT: vpaddb LCPI33_0, %ymm0, %ymm0			; CHECK-NEXT: vpaddb LCPI34_0, %ymm0, %ymm0
	; CHECK-NEXT: vmovdqu %ymm0, (%eax)			; CHECK-NEXT: vmovdqu %ymm0, (%eax)
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>			%a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
	call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)			call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
	ret void			ret void
	}			}
	declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind			declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
	▲ Show 20 Lines • Show All 121 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx2-intrinsics-x86.ll

Show First 20 Lines • Show All 830 Lines • ▼ Show 20 Lines
; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]		; CHECK-NEXT: vpsignw %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x09,0xc1]
; CHECK-NEXT: retl ## encoding: [0xc3]		; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]		%res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res		ret <16 x i16> %res
}		}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone		declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone


define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
; AVX2-LABEL: test_x86_avx2_movntdqa:
; AVX2: ## BB#0:
; AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX2-NEXT: vmovntdqa (%eax), %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x2a,0x00]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_movntdqa:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; AVX512VL-NEXT: vmovntdqa (%eax), %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2a,0x00]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly


define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {		define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: test_x86_avx2_mpsadbw:		; CHECK-LABEL: test_x86_avx2_mpsadbw:
; CHECK: ## BB#0:		; CHECK: ## BB#0:
; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]		; CHECK-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x42,0xc1,0x07]
; CHECK-NEXT: retl ## encoding: [0xc3]		; CHECK-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]		%res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i8 7) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res		ret <16 x i16> %res
}		}
▲ Show 20 Lines • Show All 488 Lines • ▼ Show 20 Lines	; AVX512VL-NEXT: retl ## encoding: [0xc3]
ret <4 x i32> %res		ret <4 x i32> %res
}		}

define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {		define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrav_d_const:		; AVX2-LABEL: test_x86_avx2_psrav_d_const:
; AVX2: ## BB#0:		; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]		; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]		; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI91_0, kind: FK_Data_4		; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
; AVX2-NEXT: vpsravd LCPI91_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]		; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI91_1, kind: FK_Data_4		; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
; AVX2-NEXT: retl ## encoding: [0xc3]		; AVX2-NEXT: retl ## encoding: [0xc3]
;		;
; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:		; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
; AVX512VL: ## BB#0:		; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vmovdqa LCPI91_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]		; AVX512VL-NEXT: vmovdqa LCPI90_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]		; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI91_0, kind: FK_Data_4		; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
; AVX512VL-NEXT: vpsravd LCPI91_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]		; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI91_1, kind: FK_Data_4		; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
; AVX512VL-NEXT: retl ## encoding: [0xc3]		; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)		%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
ret <4 x i32> %res		ret <4 x i32> %res
}		}
declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone		declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone

define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {		define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrav_d_256:		; AVX2-LABEL: test_x86_avx2_psrav_d_256:
Show All 9 Lines	; AVX512VL-NEXT: retl ## encoding: [0xc3]
ret <8 x i32> %res		ret <8 x i32> %res
}		}

define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) {		define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1) {
; AVX2-LABEL: test_x86_avx2_psrav_d_256_const:		; AVX2-LABEL: test_x86_avx2_psrav_d_256_const:
; AVX2: ## BB#0:		; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]		; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]		; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI93_0, kind: FK_Data_4		; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
; AVX2-NEXT: vpsravd LCPI93_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]		; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI93_1, kind: FK_Data_4		; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
; AVX2-NEXT: retl ## encoding: [0xc3]		; AVX2-NEXT: retl ## encoding: [0xc3]
;		;
; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:		; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
; AVX512VL: ## BB#0:		; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vmovdqa LCPI93_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]		; AVX512VL-NEXT: vmovdqa LCPI92_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]		; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI93_0, kind: FK_Data_4		; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
; AVX512VL-NEXT: vpsravd LCPI93_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]		; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI93_1, kind: FK_Data_4		; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
; AVX512VL-NEXT: retl ## encoding: [0xc3]		; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)		%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
ret <8 x i32> %res		ret <8 x i32> %res
}		}
declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone		declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {		define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1, <4 x i32> %idx, <2 x double> %mask) {
; CHECK-LABEL: test_x86_avx2_gather_d_pd:		; CHECK-LABEL: test_x86_avx2_gather_d_pd:
▲ Show 20 Lines • Show All 238 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/avx512-intrinsics-upgrade.ll

	Show First 20 Lines • Show All 3,055 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)			%res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4)
	%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)			%res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1)
	%res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)			%res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4)
	%res3 = add <8 x i64> %res, %res1			%res3 = add <8 x i64> %res, %res1
	%res4 = add <8 x i64> %res2, %res3			%res4 = add <8 x i64> %res2, %res3
	ret <8 x i64> %res4			ret <8 x i64> %res4
	}			}

				define <8 x i64> @test_x86_avx512_movntdqa(i8* %a0) {
				; CHECK-LABEL: test_x86_avx512_movntdqa:
				; CHECK: ## BB#0:
				; CHECK-NEXT: vmovntdqa (%rdi), %zmm0
				; CHECK-NEXT: retq
				%res = call <8 x i64> @llvm.x86.avx512.movntdqa(i8* %a0)
				ret <8 x i64> %res
				}

				declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*) nounwind readonly

llvm/trunk/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll

	Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]			; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3]
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 17) ; <<4 x float>> [#uses=1]			%res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i32 17) ; <<4 x float>> [#uses=1]
	ret <4 x float> %res			ret <4 x float> %res
	}			}
	declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone			declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone


				define <2 x i64> @test_x86_sse41_movntdqa(<2 x i64>* %a0) {
				; CHECK-LABEL: test_x86_sse41_movntdqa:
				; CHECK: ## BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: movntdqa (%eax), %xmm0
				; CHECK-NEXT: retl
				%arg0 = bitcast <2 x i64>* %a0 to i8*
				%res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
				ret <2 x i64> %res
				}
				declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone


	define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {			define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
	; CHECK-LABEL: test_x86_sse41_mpsadbw:			; CHECK-LABEL: test_x86_sse41_mpsadbw:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: mpsadbw $7, %xmm1, %xmm0			; CHECK-NEXT: mpsadbw $7, %xmm1, %xmm0
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]			%res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i32 7) ; <<8 x i16>> [#uses=1]
	ret <8 x i16> %res			ret <8 x i16> %res
	}			}
	▲ Show 20 Lines • Show All 233 Lines • Show Last 20 Lines