Diff 263181

llvm/lib/CodeGen/CodeGenPrepare.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 6,428 Lines • ▼ Show 20 Lines

	/// Some targets have expensive vector shifts if the lanes aren't all the same			/// Some targets have expensive vector shifts if the lanes aren't all the same
	/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases			/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
	/// it's often worth sinking a shufflevector splat down to its use so that			/// it's often worth sinking a shufflevector splat down to its use so that
	/// codegen can spot all lanes are identical.			/// codegen can spot all lanes are identical.
	bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {			bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
	BasicBlock *DefBB = SVI->getParent();			BasicBlock *DefBB = SVI->getParent();

	// Only do this xform if variable vector shifts are particularly expensive.			// Only do this transform if variable vector shifts are expensive.
				// We are also using shift cost as a proxy for rotate and funnel shift ops.
	if (!TLI->isVectorShiftByScalarCheap(SVI->getType()))			if (!TLI->isVectorShiftByScalarCheap(SVI->getType()))
				RKSimon (inline comment; unsubmitted, done): Please can you mention that we use it for funnels/rotates in the isVectorShiftByScalarCheap descriptions in TargetLowering.h/X86ISelLowering.cpp?
	return false;			return false;

	// We only expect better codegen by sinking a shuffle if we can recognise a			// We only expect better codegen by sinking a shuffle if we can recognise a
	// constant splat.			// constant splat.
	if (!isBroadcastShuffle(SVI))			if (!isBroadcastShuffle(SVI))
	return false;			return false;

	// InsertedShuffles - Only insert a shuffle in each block once.			// InsertedShuffles - Only insert a shuffle in each block once.
	DenseMap<BasicBlock, Instruction> InsertedShuffles;			DenseMap<BasicBlock, Instruction> InsertedShuffles;

	bool MadeChange = false;			bool MadeChange = false;
	for (User *U : SVI->users()) {			for (User *U : SVI->users()) {
	Instruction *UI = cast<Instruction>(U);			Instruction *UI = cast<Instruction>(U);

	// Figure out which BB this ext is used in.			// Figure out which BB this ext is used in.
	BasicBlock *UserBB = UI->getParent();			BasicBlock *UserBB = UI->getParent();
	if (UserBB == DefBB) continue;			if (UserBB == DefBB) continue;

	// For now only apply this when the splat is used by a shift instruction.			// Check if the splat is used by a shift or funnel-shift instruction.
	if (!UI->isShift()) continue;			if (!UI->isShift()) {
				auto *II = dyn_cast<IntrinsicInst>(UI);
				if (!II \|\| (II->getIntrinsicID() != Intrinsic::fshl &&
				II->getIntrinsicID() != Intrinsic::fshr))
				continue;
				}

	// Everything checks out, sink the shuffle if the user's block doesn't			// Everything checks out, sink the shuffle if the user's block doesn't
	// already have a copy.			// already have a copy.
	Instruction *&InsertedShuffle = InsertedShuffles[UserBB];			Instruction *&InsertedShuffle = InsertedShuffles[UserBB];

	if (!InsertedShuffle) {			if (!InsertedShuffle) {
	BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();			BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
	assert(InsertPt != UserBB->end());			assert(InsertPt != UserBB->end());
	▲ Show 20 Lines • Show All 1,270 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-fshl-128.ll

	Show First 20 Lines • Show All 2,137 Lines • ▼ Show 20 Lines

	; CGP should allow a cross-block splat shift amount to be seen in SDAG.			; CGP should allow a cross-block splat shift amount to be seen in SDAG.
	; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426			; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426

	define void @sink_splatvar(i32* %p, i32 %shift_amt) {			define void @sink_splatvar(i32* %p, i32 %shift_amt) {
	; SSE2-LABEL: sink_splatvar:			; SSE2-LABEL: sink_splatvar:
	; SSE2: # %bb.0: # %entry			; SSE2: # %bb.0: # %entry
	; SSE2-NEXT: movd %esi, %xmm0			; SSE2-NEXT: movd %esi, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00			; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
	; SSE2-NEXT: pand {{.*}}(%rip), %xmm0			; SSE2-NEXT: movd %xmm0, %ecx
	; SSE2-NEXT: pslld $23, %xmm0			; SSE2-NEXT: andl $31, %ecx
	; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0			; SSE2-NEXT: movl $32, %edx
	; SSE2-NEXT: cvttps2dq %xmm0, %xmm0			; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSE2-NEXT: movd %edx, %xmm0
				; SSE2-NEXT: movd %ecx, %xmm1
	; SSE2-NEXT: .p2align 4, 0x90			; SSE2-NEXT: .p2align 4, 0x90
	; SSE2-NEXT: .LBB8_1: # %loop			; SSE2-NEXT: .LBB8_1: # %loop
	; SSE2-NEXT: # =>This Inner Loop Header: Depth=1			; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
	; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2			; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; SSE2-NEXT: movdqa %xmm2, %xmm3
	; SSE2-NEXT: pmuludq %xmm0, %xmm2			; SSE2-NEXT: psrld %xmm0, %xmm3
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]			; SSE2-NEXT: pslld %xmm1, %xmm2
	; SSE2-NEXT: pmuludq %xmm1, %xmm3			; SSE2-NEXT: por %xmm3, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
	; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; SSE2-NEXT: por %xmm4, %xmm2
	; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax)			; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
	; SSE2-NEXT: addq $16, %rax			; SSE2-NEXT: addq $16, %rax
	; SSE2-NEXT: jne .LBB8_1			; SSE2-NEXT: jne .LBB8_1
	; SSE2-NEXT: # %bb.2: # %end			; SSE2-NEXT: # %bb.2: # %end
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: sink_splatvar:			; SSE41-LABEL: sink_splatvar:
	; SSE41: # %bb.0: # %entry			; SSE41: # %bb.0: # %entry
	; SSE41-NEXT: movd %esi, %xmm0			; SSE41-NEXT: movd %esi, %xmm0
	; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00			; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00
	; SSE41-NEXT: pand {{.*}}(%rip), %xmm0			; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
	; SSE41-NEXT: pslld $23, %xmm0			; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
	; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0			; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
	; SSE41-NEXT: cvttps2dq %xmm0, %xmm0			; SSE41-NEXT: psubd %xmm1, %xmm0
	; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
				; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
	; SSE41-NEXT: .p2align 4, 0x90			; SSE41-NEXT: .p2align 4, 0x90
	; SSE41-NEXT: .LBB8_1: # %loop			; SSE41-NEXT: .LBB8_1: # %loop
	; SSE41-NEXT: # =>This Inner Loop Header: Depth=1			; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
	; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2			; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2
	; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; SSE41-NEXT: movdqa %xmm2, %xmm3
	; SSE41-NEXT: pmuludq %xmm1, %xmm3			; SSE41-NEXT: psrld %xmm0, %xmm3
	; SSE41-NEXT: pmuludq %xmm0, %xmm2			; SSE41-NEXT: pslld %xmm1, %xmm2
	; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]			; SSE41-NEXT: por %xmm3, %xmm2
	; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]			; SSE41-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
	; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
	; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
	; SSE41-NEXT: por %xmm4, %xmm3
	; SSE41-NEXT: movdqu %xmm3, 1024(%rdi,%rax)
	; SSE41-NEXT: addq $16, %rax			; SSE41-NEXT: addq $16, %rax
	; SSE41-NEXT: jne .LBB8_1			; SSE41-NEXT: jne .LBB8_1
	; SSE41-NEXT: # %bb.2: # %end			; SSE41-NEXT: # %bb.2: # %end
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: sink_splatvar:			; AVX1-LABEL: sink_splatvar:
	; AVX1: # %bb.0: # %entry			; AVX1: # %bb.0: # %entry
	; AVX1-NEXT: vmovd %esi, %xmm0			; AVX1-NEXT: vmovd %esi, %xmm0
	; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00			; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
	; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0			; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; AVX1-NEXT: vpslld $23, %xmm0, %xmm0			; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
	; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0			; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [32,32,32,32]
	; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0			; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
				; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
				craig.topper (inline comment; unsubmitted, not done): Why are we splatting the scalar here when only element 0 is used?
				spatel (author; inline comment; unsubmitted, done): This is another limitation caused by the block-level visibility - SDAG doesn't know that the splat is from a scalar because we are only sinking the shuffle instruction, not the insertelement:
				    t12: v4i32,ch = CopyFromReg t0, Register:v4i32 %0
				    t14: v4i32 = vector_shuffle<0,0,0,0> t12, undef:v4i32
				The splat doesn't get hoisted back out of the loop until later in MachineLICM, and there's apparently no really late analysis for demanded elements. We could try to sink insertelement to shuffles. That should probably be another patch though.
				craig.topper (inline comment; unsubmitted, not done): I'm still confused. Shouldn't demandedelts inside selectiondag have determined the splat shuffle was unnecessary regardless of it coming from an insertelement?
				spatel (author; inline comment; unsubmitted, done): Ah, I see. Starting from the x86 shift nodes, we should see that we only need the low chunk. I didn't step through, but there are many potential candidates here that would foil the analysis: too many intervening nodes, casts to different sizes, and/or multiple uses:
				    t14: v4i32 = vector_shuffle<0,0,0,0> t12, undef:v4i32
				    t45: v4i32 = BUILD_VECTOR Constant:i32<31>, Constant:i32<31>, Constant:i32<31>, Constant:i32<31>
				    t46: v4i32 = and t14, t45
				    t25: ch = CopyToReg t0, Register:i64 %2, t23
				    t54: v2i64 = zero_extend_vector_inreg t46
				    t55: v4i32 = bitcast t54
				    t56: v4i32 = X86ISD::VSHL t10, t55
				    t48: v4i32 = BUILD_VECTOR Constant:i32<32>, Constant:i32<32>, Constant:i32<32>, Constant:i32<32>
				    t49: v4i32 = sub t48, t46
				    t58: v2i64 = zero_extend_vector_inreg t49
				    t59: v4i32 = bitcast t58
				    t60: v4i32 = X86ISD::VSRL t10, t59
	; AVX1-NEXT: .p2align 4, 0x90			; AVX1-NEXT: .p2align 4, 0x90
	; AVX1-NEXT: .LBB8_1: # %loop			; AVX1-NEXT: .LBB8_1: # %loop
	; AVX1-NEXT: # =>This Inner Loop Header: Depth=1			; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
	; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2			; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
	; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; AVX1-NEXT: vpsrld %xmm0, %xmm2, %xmm3
	; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3			; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
	; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2			; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
	; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
	; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
	; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
	; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
	; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
	; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)			; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
	; AVX1-NEXT: addq $16, %rax			; AVX1-NEXT: addq $16, %rax
	; AVX1-NEXT: jne .LBB8_1			; AVX1-NEXT: jne .LBB8_1
	; AVX1-NEXT: # %bb.2: # %end			; AVX1-NEXT: # %bb.2: # %end
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: sink_splatvar:			; AVX2-LABEL: sink_splatvar:
	; AVX2: # %bb.0: # %entry			; AVX2: # %bb.0: # %entry
	▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines
	; XOPAVX2-NEXT: retq			; XOPAVX2-NEXT: retq
	;			;
	; X32-SSE-LABEL: sink_splatvar:			; X32-SSE-LABEL: sink_splatvar:
	; X32-SSE: # %bb.0: # %entry			; X32-SSE: # %bb.0: # %entry
	; X32-SSE-NEXT: pushl %esi			; X32-SSE-NEXT: pushl %esi
	; X32-SSE-NEXT: .cfi_def_cfa_offset 8			; X32-SSE-NEXT: .cfi_def_cfa_offset 8
	; X32-SSE-NEXT: .cfi_offset %esi, -8			; X32-SSE-NEXT: .cfi_offset %esi, -8
	; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; X32-SSE-NEXT: xorl %ecx, %ecx			; X32-SSE-NEXT: xorl %ecx, %ecx
	; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0			; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X32-SSE-NEXT: pslld $23, %xmm0			; X32-SSE-NEXT: movd %xmm0, %edx
	; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm0			; X32-SSE-NEXT: andl $31, %edx
	; X32-SSE-NEXT: cvttps2dq %xmm0, %xmm0			; X32-SSE-NEXT: movl $32, %esi
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; X32-SSE-NEXT: subl %edx, %esi
				; X32-SSE-NEXT: movd %esi, %xmm0
				; X32-SSE-NEXT: movd %edx, %xmm1
	; X32-SSE-NEXT: xorl %edx, %edx			; X32-SSE-NEXT: xorl %edx, %edx
	; X32-SSE-NEXT: .p2align 4, 0x90			; X32-SSE-NEXT: .p2align 4, 0x90
	; X32-SSE-NEXT: .LBB8_1: # %loop			; X32-SSE-NEXT: .LBB8_1: # %loop
	; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1			; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1
	; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2			; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; X32-SSE-NEXT: movdqa %xmm2, %xmm3
	; X32-SSE-NEXT: pmuludq %xmm0, %xmm2			; X32-SSE-NEXT: psrld %xmm0, %xmm3
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]			; X32-SSE-NEXT: pslld %xmm1, %xmm2
	; X32-SSE-NEXT: pmuludq %xmm1, %xmm3			; X32-SSE-NEXT: por %xmm3, %xmm2
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
	; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
	; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; X32-SSE-NEXT: por %xmm4, %xmm2
	; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4)			; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4)
	; X32-SSE-NEXT: addl $4, %ecx			; X32-SSE-NEXT: addl $4, %ecx
	; X32-SSE-NEXT: adcl $0, %edx			; X32-SSE-NEXT: adcl $0, %edx
	; X32-SSE-NEXT: movl %ecx, %esi			; X32-SSE-NEXT: movl %ecx, %esi
	; X32-SSE-NEXT: xorl $256, %esi # imm = 0x100			; X32-SSE-NEXT: xorl $256, %esi # imm = 0x100
	; X32-SSE-NEXT: orl %edx, %esi			; X32-SSE-NEXT: orl %edx, %esi
	; X32-SSE-NEXT: jne .LBB8_1			; X32-SSE-NEXT: jne .LBB8_1
	; X32-SSE-NEXT: # %bb.2: # %end			; X32-SSE-NEXT: # %bb.2: # %end
	▲ Show 20 Lines • Show All 932 Lines • Show Last 20 Lines

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll

Show First 20 Lines • Show All 174 Lines • ▼ Show 20 Lines	if_true:
ret <2 x i64> %mask		ret <2 x i64> %mask

if_false:		if_false:
%res = lshr <2 x i64> %lhs, %mask		%res = lshr <2 x i64> %lhs, %mask
ret <2 x i64> %res		ret <2 x i64> %res
}		}

define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {		define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
; CHECK-LABEL: @funnel_splatvar(		; CHECK-SSE2-LABEL: @funnel_splatvar(
; CHECK-NEXT: entry:		; CHECK-SSE2-NEXT: entry:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0		; CHECK-SSE2-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer		; CHECK-SSE2-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]		; CHECK-SSE2: vector.body:
; CHECK: vector.body:		; CHECK-SSE2-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]		; CHECK-SSE2-NEXT: [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]		; CHECK-SSE2-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*		; CHECK-SSE2-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4		; CHECK-SSE2-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4
; CHECK-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])		; CHECK-SSE2-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[TMP0]])
; CHECK-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4		; CHECK-SSE2-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8		; CHECK-SSE2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536		; CHECK-SSE2-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]		; CHECK-SSE2-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:		; CHECK-SSE2: for.cond.cleanup:
; CHECK-NEXT: ret void		; CHECK-SSE2-NEXT: ret void
		;
		; CHECK-XOP-LABEL: @funnel_splatvar(
		; CHECK-XOP-NEXT: entry:
		; CHECK-XOP-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0
		; CHECK-XOP-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
		; CHECK-XOP-NEXT: br label [[VECTOR_BODY:%.*]]
		; CHECK-XOP: vector.body:
		; CHECK-XOP-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
		; CHECK-XOP-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]
		; CHECK-XOP-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*
		; CHECK-XOP-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4
		; CHECK-XOP-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
		; CHECK-XOP-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
		; CHECK-XOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
		; CHECK-XOP-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
		; CHECK-XOP-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
		; CHECK-XOP: for.cond.cleanup:
		; CHECK-XOP-NEXT: ret void
		;
		; CHECK-AVX-LABEL: @funnel_splatvar(
		spatel (author; inline comment; unsubmitted, done): The labeling here is a bit misleading - "AVX" means both AVX2 and AVX512, but not AVX1; there is no AVX1 run line on this file. More specific testing is shown in the x86 codegen file.
		; CHECK-AVX-NEXT: entry:
		; CHECK-AVX-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0
		; CHECK-AVX-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
		; CHECK-AVX-NEXT: br label [[VECTOR_BODY:%.*]]
		; CHECK-AVX: vector.body:
		; CHECK-AVX-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
		; CHECK-AVX-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]
		; CHECK-AVX-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*
		; CHECK-AVX-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4
		; CHECK-AVX-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
		; CHECK-AVX-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
		; CHECK-AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
		; CHECK-AVX-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
		; CHECK-AVX-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
		; CHECK-AVX: for.cond.cleanup:
		; CHECK-AVX-NEXT: ret void
;		;
entry:		entry:
%broadcast.splatinsert15 = insertelement <8 x i32> undef, i32 %rot, i32 0		%broadcast.splatinsert15 = insertelement <8 x i32> undef, i32 %rot, i32 0
%broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> undef, <8 x i32> zeroinitializer		%broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body		br label %vector.body

vector.body:		vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]		%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
Show All 14 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[x86][CGP] enable target hook to sink funnel shift intrinsic's splatted shift amount
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263181

llvm/lib/CodeGen/CodeGenPrepare.cpp

llvm/test/CodeGen/X86/vector-fshl-128.ll

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll

This is an archive of the discontinued LLVM Phabricator instance.

[x86][CGP] enable target hook to sink funnel shift intrinsic's splatted shift amount
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263181

llvm/lib/CodeGen/CodeGenPrepare.cpp

llvm/test/CodeGen/X86/vector-fshl-128.ll

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll

[x86][CGP] enable target hook to sink funnel shift intrinsic's splatted shift amount
ClosedPublic