This is an archive of the discontinued LLVM Phabricator instance.

[X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128
ClosedPublic

Authored by RKSimon on Jul 18 2016, 6:17 AM.

Download Raw Diff

Details

Reviewers

spatel
ab
delena
andreadb
craig.topper

Commits

rGea0d4f9962fb: [X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128…
rGc8e20b115035: [X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128
rL276416: [X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128…
rL276281: [X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128

Summary

As reported in PR26235, we don't currently use the VBROADCASTF128/VBROADCASTI128 instructions to load a 128-bit vector and splat it to both lanes of a 256-bit vector.

This patch enables lowering from subvector insertion/concatenation patterns and auto-upgrades the llvm.x86.avx.vbroadcastf128.pd.256 / llvm.x86.avx.vbroadcastf128.ps.256 intrinsics to match.

Once this is in place I can update _mm256_broadcast_ps and _mm256_broadcast_pd in the headers to use generic IR and remove the clang builtins.

We could possibly investigate using VBROADCASTF128/VBROADCASTI128 to load repeated constants as well (similar to how we already do for scalar broadcasts).

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon updated this revision to Diff 64310. Jul 18 2016, 6:17 AM

RKSimon retitled this revision to [X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128.

RKSimon updated this object.

RKSimon added reviewers: craig.topper, delena, spatel, andreadb.

RKSimon set the repository for this revision to rL LLVM.

RKSimon added a reviewer: ab.

RKSimon added a subscriber: llvm-commits.

delena added inline comments. Jul 19 2016, 12:19 AM

test/CodeGen/X86/vector-shuffle-256-v4.ll
1365	The instruction selected here is from the AVX2 instruction set. The same patterns should also be added for AVX-512.

Added AVX512 support

delena accepted this revision. Jul 20 2016, 10:56 AM

delena edited edge metadata.

This revision is now accepted and ready to land. Jul 20 2016, 10:56 AM

Closed by commit rL276281: [X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128 (authored by RKSimon). · Explain Why · Jul 21 2016, 7:18 AM

This revision was automatically updated to reflect the committed changes.

For the record, this CL is identified as a possible cause of https://llvm.org/bugs/show_bug.cgi?id=28657

RKSimon mentioned this in rL276417: [X86][AVX] Added support for lowering to VBROADCASTF128/VBROADCASTI128 with…. Jul 22 2016, 7:06 AM

RKSimon mentioned this in rL277214: [X86][AVX] Fix VBROADCASTF128 selection bug (PR28770). Jul 29 2016, 2:13 PM

RKSimon mentioned this in D28747: [X86] Don't create VBROADCAST nodes with 256-bit or 512-bit input types. Jan 30 2017, 5:09 AM

Revision Contents

Path

Size

lib/

IR/

	AutoUpgrade.cpp
	AutoUpgrade.cpp (revision 275781)

21 lines

Target/

X86/

	X86ISelLowering.cpp
	X86ISelLowering.cpp (revision 275781)

8 lines

	X86InstrSSE.td
	X86InstrSSE.td (revision 275781)

37 lines

test/

CodeGen/

X86/

	avx-intrinsics-x86-upgrade.ll
	avx-intrinsics-x86-upgrade.ll (revision 275781)

26 lines

	avx-intrinsics-x86.ll
	avx-intrinsics-x86.ll (revision 275781)

40 lines

	avx-vbroadcastf128.ll
	avx-vbroadcastf128.ll (revision 275781)

36 lines

	avx2-intrinsics-fast-isel.ll
	avx2-intrinsics-fast-isel.ll (revision 275781)

6 lines

	avx2-vbroadcasti128.ll
	avx2-vbroadcasti128.ll (revision 275781)

36 lines

	vector-shuffle-256-v4.ll
	vector-shuffle-256-v4.ll (revision 275781)

30 lines

Diff 64310

lib/IR/AutoUpgrade.cpp

Show First 20 Lines • Show All 292 Lines • ▼ Show 20 Lines	if (IsX86 &&
Name.startswith("avx2.psrl.dq") \|\|		Name.startswith("avx2.psrl.dq") \|\|
Name.startswith("avx512.psll.dq") \|\|		Name.startswith("avx512.psll.dq") \|\|
Name.startswith("avx512.psrl.dq") \|\|		Name.startswith("avx512.psrl.dq") \|\|
Name == "sse41.pblendw" \|\|		Name == "sse41.pblendw" \|\|
Name.startswith("sse41.blendp") \|\|		Name.startswith("sse41.blendp") \|\|
Name.startswith("avx.blend.p") \|\|		Name.startswith("avx.blend.p") \|\|
Name == "avx2.pblendw" \|\|		Name == "avx2.pblendw" \|\|
Name.startswith("avx2.pblendd.") \|\|		Name.startswith("avx2.pblendd.") \|\|
		Name.startswith("avx.vbroadcastf128") \|\|
Name == "avx2.vbroadcasti128" \|\|		Name == "avx2.vbroadcasti128" \|\|
Name == "xop.vpcmov" \|\|		Name == "xop.vpcmov" \|\|
(Name.startswith("xop.vpcom") && F->arg_size() == 2))) {		(Name.startswith("xop.vpcom") && F->arg_size() == 2))) {
NewFn = nullptr;		NewFn = nullptr;
return true;		return true;
}		}
// SSE4.1 ptest functions may have an old signature.		// SSE4.1 ptest functions may have an old signature.
if (IsX86 && Name.startswith("sse41.ptest")) {		if (IsX86 && Name.startswith("sse41.ptest")) {
▲ Show 20 Lines • Show All 580 Lines • ▼ Show 20 Lines	if (IsX86 && (Name.startswith("sse2.pcmpeq.") \|\|
Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);		Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);
Rep = Builder.CreateOr(Sel0, Sel1);		Rep = Builder.CreateOr(Sel0, Sel1);
} else if (IsX86 && Name == "sse42.crc32.64.8") {		} else if (IsX86 && Name == "sse42.crc32.64.8") {
Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),		Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::x86_sse42_crc32_32_8);		Intrinsic::x86_sse42_crc32_32_8);
Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));		Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)});		Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)});
Rep = Builder.CreateZExt(Rep, CI->getType(), "");		Rep = Builder.CreateZExt(Rep, CI->getType(), "");
} else if (IsX86 && Name.startswith("avx.vbroadcast")) {		} else if (IsX86 && Name.startswith("avx.vbroadcast.s")) {
// Replace broadcasts with a series of insertelements.		// Replace broadcasts with a series of insertelements.
Type *VecTy = CI->getType();		Type *VecTy = CI->getType();
Type *EltTy = VecTy->getVectorElementType();		Type *EltTy = VecTy->getVectorElementType();
unsigned EltNum = VecTy->getVectorNumElements();		unsigned EltNum = VecTy->getVectorNumElements();
Value *Cast = Builder.CreateBitCast(CI->getArgOperand(0),		Value *Cast = Builder.CreateBitCast(CI->getArgOperand(0),
EltTy->getPointerTo());		EltTy->getPointerTo());
Value *Load = Builder.CreateLoad(EltTy, Cast);		Value *Load = Builder.CreateLoad(EltTy, Cast);
Type *I32Ty = Type::getInt32Ty(C);		Type *I32Ty = Type::getInt32Ty(C);
Show All 15 Lines	if (IsX86 && (Name.startswith("sse2.pcmpeq.") \|\|
ShuffleMask[i] = i;		ShuffleMask[i] = i;

Value *SV = Builder.CreateShuffleVector(		Value *SV = Builder.CreateShuffleVector(
CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask);		CI->getArgOperand(0), UndefValue::get(SrcTy), ShuffleMask);

bool DoSext = (StringRef::npos != Name.find("pmovsx"));		bool DoSext = (StringRef::npos != Name.find("pmovsx"));
Rep = DoSext ? Builder.CreateSExt(SV, DstTy)		Rep = DoSext ? Builder.CreateSExt(SV, DstTy)
: Builder.CreateZExt(SV, DstTy);		: Builder.CreateZExt(SV, DstTy);
} else if (IsX86 && Name == "avx2.vbroadcasti128") {		} else if (IsX86 && (Name.startswith("avx.vbroadcastf128") \|\|
// Replace vbroadcasts with a vector shuffle.		Name == "avx2.vbroadcasti128")) {
Type *VT = VectorType::get(Type::getInt64Ty(C), 2);		// Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle.
		Type *EltTy = CI->getType()->getVectorElementType();
		unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits();
		Type *VT = VectorType::get(EltTy, NumSrcElts);
Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0),		Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0),
PointerType::getUnqual(VT));		PointerType::getUnqual(VT));
Value *Load = Builder.CreateLoad(VT, Op);		Value *Load = Builder.CreateLoad(VT, Op);
uint32_t Idxs[4] = { 0, 1, 0, 1 };		if (NumSrcElts == 2)
Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),		Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
Idxs);		{ 0, 1, 0, 1 });
		else
		Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
		{ 0, 1, 2, 3, 0, 1, 2, 3 });
} else if (IsX86 && (Name.startswith("avx2.pbroadcast") \|\|		} else if (IsX86 && (Name.startswith("avx2.pbroadcast") \|\|
Name.startswith("avx2.vbroadcast") \|\|		Name.startswith("avx2.vbroadcast") \|\|
Name.startswith("avx512.pbroadcast") \|\|		Name.startswith("avx512.pbroadcast") \|\|
Name.startswith("avx512.mask.broadcast.s"))) {		Name.startswith("avx512.mask.broadcast.s"))) {
// Replace vp?broadcasts with a vector shuffle.		// Replace vp?broadcasts with a vector shuffle.
Value *Op = CI->getArgOperand(0);		Value *Op = CI->getArgOperand(0);
unsigned NumElts = CI->getType()->getVectorNumElements();		unsigned NumElts = CI->getType()->getVectorNumElements();
Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);		Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
▲ Show 20 Lines • Show All 599 Lines • Show Last 20 Lines

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 12,789 Lines • ▼ Show 20 Lines	static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();		unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
MVT OpVT = Op.getSimpleValueType();		MVT OpVT = Op.getSimpleValueType();
MVT SubVecVT = SubVec.getSimpleValueType();		MVT SubVecVT = SubVec.getSimpleValueType();

// Fold two 16-byte subvector loads into one 32-byte load:		// Fold two 16-byte subvector loads into one 32-byte load:
// (insert_subvector (insert_subvector undef, (load addr), 0),		// (insert_subvector (insert_subvector undef, (load addr), 0),
// (load addr + 16), Elts/2)		// (load addr + 16), Elts/2)
// --> load32 addr		// --> load32 addr
		// or a 16-byte broadcast:
		// (insert_subvector (insert_subvector undef, (load addr), 0),
		// (load addr), Elts/2)
		// --> X86SubVBroadcast(load16 addr)
if ((IdxVal == OpVT.getVectorNumElements() / 2) &&		if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&		Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
OpVT.is256BitVector() && SubVecVT.is128BitVector()) {		OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));		auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
if (Idx2 && Idx2->getZExtValue() == 0) {		if (Idx2 && Idx2->getZExtValue() == 0) {
// If needed, look through bitcasts to get to the load.		// If needed, look through bitcasts to get to the load.
SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));		SDValue SubVec2 = peekThroughBitcasts(Vec.getOperand(1));
if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {		if (auto *FirstLd = dyn_cast<LoadSDNode>(SubVec2)) {
bool Fast;		bool Fast;
unsigned Alignment = FirstLd->getAlignment();		unsigned Alignment = FirstLd->getAlignment();
unsigned AS = FirstLd->getAddressSpace();		unsigned AS = FirstLd->getAddressSpace();
const X86TargetLowering *TLI = Subtarget.getTargetLowering();		const X86TargetLowering *TLI = Subtarget.getTargetLowering();
if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),		if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
OpVT, AS, Alignment, &Fast) && Fast) {		OpVT, AS, Alignment, &Fast) && Fast) {
SDValue Ops[] = { SubVec2, SubVec };		SDValue Ops[] = { SubVec2, SubVec };
if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))		if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
return Ld;		return Ld;
}		}

		// If lower/upper loads are the same then lower to a VBROADCASTF128.
		if (SubVec2 == peekThroughBitcasts(SubVec))
		return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
}		}
}		}
}		}

if ((OpVT.is256BitVector() \|\| OpVT.is512BitVector()) &&		if ((OpVT.is256BitVector() \|\| OpVT.is512BitVector()) &&
SubVecVT.is128BitVector())		SubVecVT.is128BitVector())
return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);		return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);

▲ Show 20 Lines • Show All 19,089 Lines • Show Last 20 Lines

lib/Target/X86/X86InstrSSE.td

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,811 Lines • ▼ Show 20 Lines	def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
v4f32, v4f32, WriteFShuffle>;		v4f32, v4f32, WriteFShuffle>;
def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,		def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
v8f32, v4f32, WriteFShuffle256>, VEX_L;		v8f32, v4f32, WriteFShuffle256>, VEX_L;
}		}
let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in		let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,		def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;		v4f64, v2f64, WriteFShuffle256>, VEX_L;

		//===----------------------------------------------------------------------===//
		// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
		// halves of a 256-bit vector.
		//
let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in		let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),		def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
(ins i128mem:$src),		(ins i128mem:$src),
"vbroadcasti128\t{$src, $dst\|$dst, $src}", []>,		"vbroadcasti128\t{$src, $dst\|$dst, $src}", []>,
Sched<[WriteLoad]>, VEX, VEX_L;		Sched<[WriteLoad]>, VEX, VEX_L;

		let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),		def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
(ins f128mem:$src),		(ins f128mem:$src),
"vbroadcastf128\t{$src, $dst\|$dst, $src}",		"vbroadcastf128\t{$src, $dst\|$dst, $src}", []>,
[(set VR256:$dst,
(int_x86_avx_vbroadcastf128_pd_256 addr:$src))]>,
Sched<[WriteFShuffleLd]>, VEX, VEX_L;		Sched<[WriteFShuffleLd]>, VEX, VEX_L;

let Predicates = [HasAVX] in		let Predicates = [HasAVX2] in {
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),		def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
		(VBROADCASTI128 addr:$src)>;
		def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
		(VBROADCASTI128 addr:$src)>;
		def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
		(VBROADCASTI128 addr:$src)>;
		def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
		(VBROADCASTI128 addr:$src)>;
		}

		let Predicates = [HasAVX] in {
		def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
(VBROADCASTF128 addr:$src)>;		(VBROADCASTF128 addr:$src)>;
		def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
		(VBROADCASTF128 addr:$src)>;
		}

		let Predicates = [HasAVX1Only] in {
		def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
		(VBROADCASTF128 addr:$src)>;
		def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
		(VBROADCASTF128 addr:$src)>;
		def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
		(VBROADCASTF128 addr:$src)>;
		def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
		(VBROADCASTF128 addr:$src)>;
		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// VINSERTF128 - Insert packed floating-point values		// VINSERTF128 - Insert packed floating-point values
//		//
let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {		let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),		def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, u8imm:$src3),		(ins VR256:$src1, VR128:$src2, u8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",		"vinsertf128\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
▲ Show 20 Lines • Show All 994 Lines • Show Last 20 Lines

test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll

	Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines
	; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>			; CHECK-NEXT: ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)			%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
	ret <2 x double> %res			ret <2 x double> %res
	}			}


				define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
				; CHECK-LABEL: test_x86_avx_vbroadcastf128_pd_256:
				; CHECK: ## BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
				; CHECK-NEXT: retl
				%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
				ret <4 x double> %res
				}
				declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly


				define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
				; CHECK-LABEL: test_x86_avx_vbroadcastf128_ps_256:
				; CHECK: ## BB#0:
				; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
				; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
				; CHECK-NEXT: retl
				%res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1]
				ret <8 x float> %res
				}
				declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly


	define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {			define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
	; CHECK-LABEL: test_x86_avx_blend_pd_256:			; CHECK-LABEL: test_x86_avx_blend_pd_256:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]			; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]			%res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
	ret <4 x double> %res			ret <4 x double> %res
	}			}
	▲ Show 20 Lines • Show All 276 Lines • ▼ Show 20 Lines
	declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone			declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone


	define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {			define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
	; add operation forces the execution domain.			; add operation forces the execution domain.
	; CHECK-LABEL: test_x86_sse2_storeu_dq:			; CHECK-LABEL: test_x86_sse2_storeu_dq:
	; CHECK: ## BB#0:			; CHECK: ## BB#0:
	; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax			; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
	; CHECK-NEXT: vpaddb LCPI34_0, %xmm0, %xmm0			; CHECK-NEXT: vpaddb LCPI36_0, %xmm0, %xmm0
	; CHECK-NEXT: vmovdqu %xmm0, (%eax)			; CHECK-NEXT: vmovdqu %xmm0, (%eax)
	; CHECK-NEXT: retl			; CHECK-NEXT: retl
	%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>			%a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
	call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)			call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
	ret void			ret void
	}			}
	declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind			declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind

	▲ Show 20 Lines • Show All 123 Lines • Show Last 20 Lines

test/CodeGen/X86/avx-intrinsics-x86.ll

Show First 20 Lines • Show All 3,931 Lines • ▼ Show 20 Lines
; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0		; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0
; AVX512VL-NEXT: retl		; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]		%res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res		ret <8 x float> %res
}		}
declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone		declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone


define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
; AVX-LABEL: test_x86_avx_vbroadcastf128_pd_256:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_pd_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT: retl
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8*) nounwind readonly


define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
; AVX-LABEL: test_x86_avx_vbroadcastf128_ps_256:
; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX-NEXT: retl
;
; AVX512VL-LABEL: test_x86_avx_vbroadcastf128_ps_256:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX512VL-NEXT: retl
%res = call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %a0) ; <<8 x float>> [#uses=1]
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly


define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {		define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
; AVX-LABEL: test_x86_avx_vperm2f128_pd_256:		; AVX-LABEL: test_x86_avx_vperm2f128_pd_256:
; AVX: ## BB#0:		; AVX: ## BB#0:
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]		; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
; AVX-NEXT: retl		; AVX-NEXT: retl
;		;
; AVX512VL-LABEL: test_x86_avx_vperm2f128_pd_256:		; AVX512VL-LABEL: test_x86_avx_vperm2f128_pd_256:
; AVX512VL: ## BB#0:		; AVX512VL: ## BB#0:
▲ Show 20 Lines • Show All 563 Lines • ▼ Show 20 Lines	; AVX512VL-NEXT: retl
ret i32 %tmp		ret i32 %tmp
}		}
declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind		declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind

define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {		define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
; AVX-LABEL: movnt_dq:		; AVX-LABEL: movnt_dq:
; AVX: ## BB#0:		; AVX: ## BB#0:
; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax		; AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0		; AVX-NEXT: vpaddq LCPI252_0, %xmm0, %xmm0
; AVX-NEXT: vmovntdq %ymm0, (%eax)		; AVX-NEXT: vmovntdq %ymm0, (%eax)
; AVX-NEXT: vzeroupper		; AVX-NEXT: vzeroupper
; AVX-NEXT: retl		; AVX-NEXT: retl
;		;
; AVX512VL-LABEL: movnt_dq:		; AVX512VL-LABEL: movnt_dq:
; AVX512VL: ## BB#0:		; AVX512VL: ## BB#0:
; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax		; AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX512VL-NEXT: vpaddq LCPI254_0, %xmm0, %xmm0		; AVX512VL-NEXT: vpaddq LCPI252_0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)		; AVX512VL-NEXT: vmovntdq %ymm0, (%eax)
; AVX512VL-NEXT: retl		; AVX512VL-NEXT: retl
%a2 = add <2 x i64> %a1, <i64 1, i64 1>		%a2 = add <2 x i64> %a1, <i64 1, i64 1>
%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>		%a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a3) nounwind		tail call void @llvm.x86.avx.movnt.dq.256(i8* %p, <4 x i64> %a3) nounwind
ret void		ret void
}		}
declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind		declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines

test/CodeGen/X86/avx-vbroadcastf128.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx \| FileCheck %s --check-prefix=X32			; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx \| FileCheck %s --check-prefix=X32
	; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx \| FileCheck %s --check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx \| FileCheck %s --check-prefix=X64

	define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {			define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
	; X32-LABEL: test_broadcast_2f64_4f64:			; X32-LABEL: test_broadcast_2f64_4f64:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_2f64_4f64:			; X64-LABEL: test_broadcast_2f64_4f64:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <2 x double>, <2 x double> *%p			%1 = load <2 x double>, <2 x double> *%p
	%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>			%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
	ret <4 x double> %2			ret <4 x double> %2
	}			}

	define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {			define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
	; X32-LABEL: test_broadcast_2i64_4i64:			; X32-LABEL: test_broadcast_2i64_4i64:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_2i64_4i64:			; X64-LABEL: test_broadcast_2i64_4i64:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <2 x i64>, <2 x i64> *%p			%1 = load <2 x i64>, <2 x i64> *%p
	%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>			%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
	ret <4 x i64> %2			ret <4 x i64> %2
	}			}

	define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {			define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
	; X32-LABEL: test_broadcast_4f32_8f32:			; X32-LABEL: test_broadcast_4f32_8f32:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_4f32_8f32:			; X64-LABEL: test_broadcast_4f32_8f32:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <4 x float>, <4 x float> *%p			%1 = load <4 x float>, <4 x float> *%p
	%2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>			%2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
	ret <8 x float> %2			ret <8 x float> %2
	}			}

	define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {			define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
	; X32-LABEL: test_broadcast_4i32_8i32:			; X32-LABEL: test_broadcast_4i32_8i32:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_4i32_8i32:			; X64-LABEL: test_broadcast_4i32_8i32:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <4 x i32>, <4 x i32> *%p			%1 = load <4 x i32>, <4 x i32> *%p
	%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>			%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
	ret <8 x i32> %2			ret <8 x i32> %2
	}			}

	define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {			define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
	; X32-LABEL: test_broadcast_8i16_16i16:			; X32-LABEL: test_broadcast_8i16_16i16:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_8i16_16i16:			; X64-LABEL: test_broadcast_8i16_16i16:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <8 x i16>, <8 x i16> *%p			%1 = load <8 x i16>, <8 x i16> *%p
	%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>			%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
	ret <16 x i16> %2			ret <16 x i16> %2
	}			}

	define <32 x i8> @test_broadcast_16i8_32i7(<16 x i8> *%p) nounwind {			define <32 x i8> @test_broadcast_16i8_32i7(<16 x i8> *%p) nounwind {
	; X32-LABEL: test_broadcast_16i8_32i7:			; X32-LABEL: test_broadcast_16i8_32i7:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_16i8_32i7:			; X64-LABEL: test_broadcast_16i8_32i7:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <16 x i8>, <16 x i8> *%p			%1 = load <16 x i8>, <16 x i8> *%p
	%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>			%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	ret <32 x i8> %2			ret <32 x i8> %2
	}			}

test/CodeGen/X86/avx2-intrinsics-fast-isel.ll

Show First 20 Lines • Show All 499 Lines • ▼ Show 20 Lines	; X64-NEXT: retq
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>		%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x i64> %res		ret <4 x i64> %res
}		}

define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {		define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X32-LABEL: test_mm256_broadcastsi128_si256_mem:		; X32-LABEL: test_mm256_broadcastsi128_si256_mem:
; X32: # BB#0:		; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax		; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: vmovaps (%eax), %xmm0		; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl		; X32-NEXT: retl
;		;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:		; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64: # BB#0:		; X64: # BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0		; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq		; X64-NEXT: retq
%a0 = load <2 x i64>, <2 x i64>* %p0		%a0 = load <2 x i64>, <2 x i64>* %p0
%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>		%res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x i64> %res		ret <4 x i64> %res
}		}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {		define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_broadcastss_ps:		; X32-LABEL: test_mm_broadcastss_ps:
▲ Show 20 Lines • Show All 2,865 Lines • Show Last 20 Lines

test/CodeGen/X86/avx2-vbroadcasti128.ll

	; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
	; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32			; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32
	; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64			; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64

	define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {			define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
	; X32-LABEL: test_broadcast_2f64_4f64:			; X32-LABEL: test_broadcast_2f64_4f64:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovapd (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: vaddpd LCPI0_0, %ymm0, %ymm0			; X32-NEXT: vaddpd LCPI0_0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_2f64_4f64:			; X64-LABEL: test_broadcast_2f64_4f64:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovapd (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0			; X64-NEXT: vaddpd {{.*}}(%rip), %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <2 x double>, <2 x double> *%p			%1 = load <2 x double>, <2 x double> *%p
	%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>			%2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
	%3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>			%3 = fadd <4 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0>
	ret <4 x double> %3			ret <4 x double> %3
	}			}

	define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {			define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
	; X32-LABEL: test_broadcast_2i64_4i64:			; X32-LABEL: test_broadcast_2i64_4i64:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovdqa (%eax), %xmm0			; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0			; X32-NEXT: vpaddq LCPI1_0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_2i64_4i64:			; X64-LABEL: test_broadcast_2i64_4i64:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovdqa (%rdi), %xmm0			; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0			; X64-NEXT: vpaddq {{.*}}(%rip), %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <2 x i64>, <2 x i64> *%p			%1 = load <2 x i64>, <2 x i64> *%p
	%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>			%2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
	%3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>			%3 = add <4 x i64> %2, <i64 1, i64 2, i64 3, i64 4>
	ret <4 x i64> %3			ret <4 x i64> %3
	}			}

	define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {			define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
	; X32-LABEL: test_broadcast_4f32_8f32:			; X32-LABEL: test_broadcast_4f32_8f32:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovaps (%eax), %xmm0			; X32-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: vaddps LCPI2_0, %ymm0, %ymm0			; X32-NEXT: vaddps LCPI2_0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_4f32_8f32:			; X64-LABEL: test_broadcast_4f32_8f32:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovaps (%rdi), %xmm0			; X64-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0			; X64-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <4 x float>, <4 x float> *%p			%1 = load <4 x float>, <4 x float> *%p
	%2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>			%2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
	%3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>			%3 = fadd <8 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>
	ret <8 x float> %3			ret <8 x float> %3
	}			}

	define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {			define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
	; X32-LABEL: test_broadcast_4i32_8i32:			; X32-LABEL: test_broadcast_4i32_8i32:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovdqa (%eax), %xmm0			; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: vpaddd LCPI3_0, %ymm0, %ymm0			; X32-NEXT: vpaddd LCPI3_0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_4i32_8i32:			; X64-LABEL: test_broadcast_4i32_8i32:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovdqa (%rdi), %xmm0			; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0			; X64-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <4 x i32>, <4 x i32> *%p			%1 = load <4 x i32>, <4 x i32> *%p
	%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>			%2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
	%3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>			%3 = add <8 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
	ret <8 x i32> %3			ret <8 x i32> %3
	}			}

	define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {			define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
	; X32-LABEL: test_broadcast_8i16_16i16:			; X32-LABEL: test_broadcast_8i16_16i16:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovdqa (%eax), %xmm0			; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: vpaddw LCPI4_0, %ymm0, %ymm0			; X32-NEXT: vpaddw LCPI4_0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_8i16_16i16:			; X64-LABEL: test_broadcast_8i16_16i16:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovdqa (%rdi), %xmm0			; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0			; X64-NEXT: vpaddw {{.*}}(%rip), %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <8 x i16>, <8 x i16> *%p			%1 = load <8 x i16>, <8 x i16> *%p
	%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>			%2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
	%3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>			%3 = add <16 x i16> %2, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16>
	ret <16 x i16> %3			ret <16 x i16> %3
	}			}

	define <32 x i8> @test_broadcast_16i8_32i7(<16 x i8> *%p) nounwind {			define <32 x i8> @test_broadcast_16i8_32i7(<16 x i8> *%p) nounwind {
	; X32-LABEL: test_broadcast_16i8_32i7:			; X32-LABEL: test_broadcast_16i8_32i7:
	; X32: ## BB#0:			; X32: ## BB#0:
	; X32-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-NEXT: vmovdqa (%eax), %xmm0			; X32-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X32-NEXT: vpaddb LCPI5_0, %ymm0, %ymm0			; X32-NEXT: vpaddb LCPI5_0, %ymm0, %ymm0
	; X32-NEXT: retl			; X32-NEXT: retl
	;			;
	; X64-LABEL: test_broadcast_16i8_32i7:			; X64-LABEL: test_broadcast_16i8_32i7:
	; X64: ## BB#0:			; X64: ## BB#0:
	; X64-NEXT: vmovdqa (%rdi), %xmm0			; X64-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
	; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
	; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0			; X64-NEXT: vpaddb {{.*}}(%rip), %ymm0, %ymm0
	; X64-NEXT: retq			; X64-NEXT: retq
	%1 = load <16 x i8>, <16 x i8> *%p			%1 = load <16 x i8>, <16 x i8> *%p
	%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>			%2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
	%3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>			%3 = add <32 x i8> %2, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32>
	ret <32 x i8> %3			ret <32 x i8> %3
	}			}

test/CodeGen/X86/vector-shuffle-256-v4.ll

Show First 20 Lines • Show All 1,346 Lines • ▼ Show 20 Lines	; ALL-NEXT: retq
%v = load <2 x double>, <2 x double>* %ptr		%v = load <2 x double>, <2 x double>* %ptr
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>		%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
ret <4 x double> %shuffle		ret <4 x double> %shuffle
}		}

define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {		define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
; AVX1-LABEL: splat128_mem_v4i64_from_v2i64:		; AVX1-LABEL: splat128_mem_v4i64_from_v2i64:
; AVX1: # BB#0:		; AVX1: # BB#0:
; AVX1-NEXT: vmovaps (%rdi), %xmm0		; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq		; AVX1-NEXT: retq
;		;
; AVX2-LABEL: splat128_mem_v4i64_from_v2i64:		; AVX2-LABEL: splat128_mem_v4i64_from_v2i64:
; AVX2: # BB#0:		; AVX2: # BB#0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0		; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq		; AVX2-NEXT: retq
;		;
; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64:		; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64:
; AVX512VL: # BB#0:		; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0		; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
		delena (inline comment, not done): The selected instruction here is from the AVX2 set. The same patterns should be added to AVX-512.
; AVX512VL-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: retq		; AVX512VL-NEXT: retq
%v = load <2 x i64>, <2 x i64>* %ptr		%v = load <2 x i64>, <2 x i64>* %ptr
%shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>		%shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x i64> %shuffle		ret <4 x i64> %shuffle
}		}

define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {		define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
; AVX1-LABEL: splat128_mem_v4f64_from_v2f64:		; ALL-LABEL: splat128_mem_v4f64_from_v2f64:
; AVX1: # BB#0:		; ALL: # BB#0:
; AVX1-NEXT: vmovaps (%rdi), %xmm0		; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0		; ALL-NEXT: retq
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat128_mem_v4f64_from_v2f64:
; AVX2: # BB#0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: splat128_mem_v4f64_from_v2f64:
; AVX512VL: # BB#0:
; AVX512VL-NEXT: vmovapd (%rdi), %xmm0
; AVX512VL-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT: retq
%v = load <2 x double>, <2 x double>* %ptr		%v = load <2 x double>, <2 x double>* %ptr
%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>		%shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
ret <4 x double> %shuffle		ret <4 x double> %shuffle
}		}

define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {		define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:		; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64:
; AVX1: # BB#0:		; AVX1: # BB#0:
▲ Show 20 Lines • Show All 108 Lines • Show Last 20 Lines