Diff 263560

llvm/include/llvm/CodeGen/TargetLowering.h

Show First 20 Lines • Show All 2,330 Lines • ▼ Show 20 Lines	public:
/// store instruction.		/// store instruction.
virtual bool isLegalStoreImmediate(int64_t Value) const {		virtual bool isLegalStoreImmediate(int64_t Value) const {
// Default implementation assumes that at least 0 works since it is likely		// Default implementation assumes that at least 0 works since it is likely
// that a zero register exists or a zero immediate is allowed.		// that a zero register exists or a zero immediate is allowed.
return Value == 0;		return Value == 0;
}		}

/// Return true if it's significantly cheaper to shift a vector by a uniform		/// Return true if it's significantly cheaper to shift a vector by a uniform
/// scalar than by an amount which will vary across each lane. On x86, for		/// scalar than by an amount which will vary across each lane. On x86 before
/// example, there is a "psllw" instruction for the former case, but no simple		/// AVX2 for example, there is a "psllw" instruction for the former case, but
/// instruction for a general "a << b" operation on vectors.		/// no simple instruction for a general "a << b" operation on vectors.
		/// This should also apply to lowering for vector funnel shifts (rotates).
virtual bool isVectorShiftByScalarCheap(Type *Ty) const {		virtual bool isVectorShiftByScalarCheap(Type *Ty) const {
return false;		return false;
}		}

/// Returns true if the opcode is a commutative binary operation.		/// Returns true if the opcode is a commutative binary operation.
virtual bool isCommutativeBinOp(unsigned Opcode) const {		virtual bool isCommutativeBinOp(unsigned Opcode) const {
// FIXME: This should get its info from the td file.		// FIXME: This should get its info from the td file.
switch (Opcode) {		switch (Opcode) {
▲ Show 20 Lines • Show All 2,103 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86ISelLowering.h

Show First 20 Lines • Show All 1,026 Lines • ▼ Show 20 Lines	public:
/// Return the cost of the scaling factor used in the addressing		/// Return the cost of the scaling factor used in the addressing
/// mode represented by AM for this target, for a load/store		/// mode represented by AM for this target, for a load/store
/// of the specified type.		/// of the specified type.
/// If the AM is supported, the return value must be >= 0.		/// If the AM is supported, the return value must be >= 0.
/// If the AM is not supported, it returns a negative value.		/// If the AM is not supported, it returns a negative value.
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,		int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;		unsigned AS) const override;

		/// This is used to enable splatted operand transforms for vector shifts
		/// and vector funnel shifts.
bool isVectorShiftByScalarCheap(Type *Ty) const override;		bool isVectorShiftByScalarCheap(Type *Ty) const override;

/// Add x86-specific opcodes to the default list.		/// Add x86-specific opcodes to the default list.
bool isBinOp(unsigned Opcode) const override;		bool isBinOp(unsigned Opcode) const override;

/// Returns true if the opcode is a commutative binary operation.		/// Returns true if the opcode is a commutative binary operation.
bool isCommutativeBinOp(unsigned Opcode) const override;		bool isCommutativeBinOp(unsigned Opcode) const override;

Show All 12 Lines	public:
/// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this		/// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
/// does not necessarily apply to truncate instructions. e.g. on x86-64,		/// does not necessarily apply to truncate instructions. e.g. on x86-64,
/// all instructions that define 32-bit values implicit zero-extend the		/// all instructions that define 32-bit values implicit zero-extend the
/// result out to 64 bits.		/// result out to 64 bits.
bool isZExtFree(Type Ty1, Type Ty2) const override;		bool isZExtFree(Type Ty1, Type Ty2) const override;
bool isZExtFree(EVT VT1, EVT VT2) const override;		bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;		bool isZExtFree(SDValue Val, EVT VT2) const override;

		bool shouldSinkOperands(Instruction *I,
		SmallVectorImpl<Use *> &Ops) const override;

/// Return true if folding a vector load into ExtVal (a sign, zero, or any		/// Return true if folding a vector load into ExtVal (a sign, zero, or any
/// extend node) is profitable.		/// extend node) is profitable.
bool isVectorLoadExtDesirable(SDValue) const override;		bool isVectorLoadExtDesirable(SDValue) const override;

/// Return true if an FMA operation is faster than a pair of fmul and fadd		/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this		/// instructions. fmuladd intrinsics will be expanded to FMAs when this
/// method returns true, otherwise fmuladd is expanded to fmul + fadd.		/// method returns true, otherwise fmuladd is expanded to fmul + fadd.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,		bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
▲ Show 20 Lines • Show All 503 Lines • Show Last 20 Lines

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 30,669 Lines • ▼ Show 20 Lines	bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
case MVT::i32:		case MVT::i32:
// X86 has 8, 16, and 32-bit zero-extending loads.		// X86 has 8, 16, and 32-bit zero-extending loads.
return true;		return true;
}		}

return false;		return false;
}		}

		/// Report operands of \p I that are worth sinking next to it so that SDAG
		/// can see them together with their user.
		///
		/// Only the funnel-shift intrinsics (fshl/fshr) are handled here: if the
		/// shift-amount operand (operand 2) is a shufflevector whose mask yields a
		/// non-negative getSplatIndex() and isVectorShiftByScalarCheap() accepts the
		/// type of \p I, the use of that operand is appended to \p Ops.
		///
		/// \param I   The candidate user instruction.
		/// \param Ops Output list that receives the operand use to sink.
		/// \returns true if a use was appended to \p Ops, false otherwise.
		bool X86TargetLowering::shouldSinkOperands(Instruction *I,
		                                           SmallVectorImpl<Use *> &Ops) const {
		  // A uniform shift amount in a vector shift or funnel shift may be much
		  // cheaper than a generic variable vector shift, so make that pattern visible
		  // to SDAG by sinking the shuffle instruction next to the shift.
		  // TODO: This should handle normal shift opcodes too.
		  auto *II = dyn_cast<IntrinsicInst>(I);
		  if (!II)
		    return false;

		  Intrinsic::ID ID = II->getIntrinsicID();
		  if (ID != Intrinsic::fshl && ID != Intrinsic::fshr)
		    return false;

		  // The shift amount operand for these intrinsics is operand 2.
		  auto *Shuf = dyn_cast<ShuffleVectorInst>(II->getOperand(2));
		  if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
		      isVectorShiftByScalarCheap(I->getType())) {
		    Ops.push_back(&I->getOperandUse(2));
		    return true;
		  }

		  return false;
		}

bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {		bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))		if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
return false;		return false;

EVT SrcVT = ExtVal.getOperand(0).getValueType();		EVT SrcVT = ExtVal.getOperand(0).getValueType();

// There is no extending load for vXi1.		// There is no extending load for vXi1.
if (SrcVT.getScalarType() == MVT::i1)		if (SrcVT.getScalarType() == MVT::i1)
▲ Show 20 Lines • Show All 18,270 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-fshl-128.ll

	Show First 20 Lines • Show All 2,137 Lines • ▼ Show 20 Lines

	; CGP should allow a cross-block splat shift amount to be seen in SDAG.			; CGP should allow a cross-block splat shift amount to be seen in SDAG.
	; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426			; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426

	define void @sink_splatvar(i32* %p, i32 %shift_amt) {			define void @sink_splatvar(i32* %p, i32 %shift_amt) {
	; SSE2-LABEL: sink_splatvar:			; SSE2-LABEL: sink_splatvar:
	; SSE2: # %bb.0: # %entry			; SSE2: # %bb.0: # %entry
	; SSE2-NEXT: movd %esi, %xmm0			; SSE2-NEXT: movd %esi, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00			; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
	; SSE2-NEXT: pand {{.*}}(%rip), %xmm0			; SSE2-NEXT: movd %xmm0, %ecx
	; SSE2-NEXT: pslld $23, %xmm0			; SSE2-NEXT: andl $31, %ecx
	; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0			; SSE2-NEXT: movl $32, %edx
	; SSE2-NEXT: cvttps2dq %xmm0, %xmm0			; SSE2-NEXT: subl %ecx, %edx
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSE2-NEXT: movd %edx, %xmm0
				; SSE2-NEXT: movd %ecx, %xmm1
	; SSE2-NEXT: .p2align 4, 0x90			; SSE2-NEXT: .p2align 4, 0x90
	; SSE2-NEXT: .LBB8_1: # %loop			; SSE2-NEXT: .LBB8_1: # %loop
	; SSE2-NEXT: # =>This Inner Loop Header: Depth=1			; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
	; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2			; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; SSE2-NEXT: movdqa %xmm2, %xmm3
	; SSE2-NEXT: pmuludq %xmm0, %xmm2			; SSE2-NEXT: psrld %xmm0, %xmm3
	; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]			; SSE2-NEXT: pslld %xmm1, %xmm2
	; SSE2-NEXT: pmuludq %xmm1, %xmm3			; SSE2-NEXT: por %xmm3, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
	; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
	; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
	; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
	; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; SSE2-NEXT: por %xmm4, %xmm2
	; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax)			; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
	; SSE2-NEXT: addq $16, %rax			; SSE2-NEXT: addq $16, %rax
	; SSE2-NEXT: jne .LBB8_1			; SSE2-NEXT: jne .LBB8_1
	; SSE2-NEXT: # %bb.2: # %end			; SSE2-NEXT: # %bb.2: # %end
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: sink_splatvar:			; SSE41-LABEL: sink_splatvar:
	; SSE41: # %bb.0: # %entry			; SSE41: # %bb.0: # %entry
	; SSE41-NEXT: movd %esi, %xmm0			; SSE41-NEXT: movd %esi, %xmm0
	; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00			; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00
	; SSE41-NEXT: pand {{.*}}(%rip), %xmm0			; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
	; SSE41-NEXT: pslld $23, %xmm0			; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
	; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0			; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
	; SSE41-NEXT: cvttps2dq %xmm0, %xmm0			; SSE41-NEXT: psubd %xmm1, %xmm0
	; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
				; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
	; SSE41-NEXT: .p2align 4, 0x90			; SSE41-NEXT: .p2align 4, 0x90
	; SSE41-NEXT: .LBB8_1: # %loop			; SSE41-NEXT: .LBB8_1: # %loop
	; SSE41-NEXT: # =>This Inner Loop Header: Depth=1			; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
	; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2			; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2
	; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; SSE41-NEXT: movdqa %xmm2, %xmm3
	; SSE41-NEXT: pmuludq %xmm1, %xmm3			; SSE41-NEXT: psrld %xmm0, %xmm3
	; SSE41-NEXT: pmuludq %xmm0, %xmm2			; SSE41-NEXT: pslld %xmm1, %xmm2
	; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]			; SSE41-NEXT: por %xmm3, %xmm2
	; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]			; SSE41-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
	; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
	; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
	; SSE41-NEXT: por %xmm4, %xmm3
	; SSE41-NEXT: movdqu %xmm3, 1024(%rdi,%rax)
	; SSE41-NEXT: addq $16, %rax			; SSE41-NEXT: addq $16, %rax
	; SSE41-NEXT: jne .LBB8_1			; SSE41-NEXT: jne .LBB8_1
	; SSE41-NEXT: # %bb.2: # %end			; SSE41-NEXT: # %bb.2: # %end
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX1-LABEL: sink_splatvar:			; AVX1-LABEL: sink_splatvar:
	; AVX1: # %bb.0: # %entry			; AVX1: # %bb.0: # %entry
	; AVX1-NEXT: vmovd %esi, %xmm0			; AVX1-NEXT: vmovd %esi, %xmm0
	; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00			; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
	; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0			; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; AVX1-NEXT: vpslld $23, %xmm0, %xmm0			; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
	; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0			; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [32,32,32,32]
	; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0			; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
				; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
				craig.topper [inline comment, not done]: Why are we splatting the scalar here when only element 0 is used?
				spatel (author) [inline comment, done]: This is another limitation caused by the block-level visibility - SDAG doesn't know that the splat is from a scalar because we are only sinking the shuffle instruction, not the insertelement:
				  t12: v4i32,ch = CopyFromReg t0, Register:v4i32 %0
				  t14: v4i32 = vector_shuffle<0,0,0,0> t12, undef:v4i32
				The splat doesn't get hoisted back out of the loop until later in MachineLICM, and there's apparently no really late analysis for demanded elements. We could try to sink insertelement to shuffles. That should probably be another patch though.
				craig.topper [inline comment, not done]: I'm still confused. Shouldn't demandedelts inside selectiondag have determined the splat shuffle was unnecessary regardless of it coming from an insertelement?
				spatel (author) [inline comment, done]: Ah, I see. Starting from the x86 shift nodes, we should see that we only need the low chunk. I didn't step through, but there are many potential candidates here that would foil the analysis: too many intervening nodes, casts to different sizes, and/or multiple uses:
				  t14: v4i32 = vector_shuffle<0,0,0,0> t12, undef:v4i32
				  t45: v4i32 = BUILD_VECTOR Constant:i32<31>, Constant:i32<31>, Constant:i32<31>, Constant:i32<31>
				  t46: v4i32 = and t14, t45
				  t25: ch = CopyToReg t0, Register:i64 %2, t23
				  t54: v2i64 = zero_extend_vector_inreg t46
				  t55: v4i32 = bitcast t54
				  t56: v4i32 = X86ISD::VSHL t10, t55
				  t48: v4i32 = BUILD_VECTOR Constant:i32<32>, Constant:i32<32>, Constant:i32<32>, Constant:i32<32>
				  t49: v4i32 = sub t48, t46
				  t58: v2i64 = zero_extend_vector_inreg t49
				  t59: v4i32 = bitcast t58
				  t60: v4i32 = X86ISD::VSRL t10, t59
	; AVX1-NEXT: .p2align 4, 0x90			; AVX1-NEXT: .p2align 4, 0x90
	; AVX1-NEXT: .LBB8_1: # %loop			; AVX1-NEXT: .LBB8_1: # %loop
	; AVX1-NEXT: # =>This Inner Loop Header: Depth=1			; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
	; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2			; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
	; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; AVX1-NEXT: vpsrld %xmm0, %xmm2, %xmm3
	; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3			; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
	; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2			; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
	; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
	; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
	; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
	; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
	; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
	; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)			; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
	; AVX1-NEXT: addq $16, %rax			; AVX1-NEXT: addq $16, %rax
	; AVX1-NEXT: jne .LBB8_1			; AVX1-NEXT: jne .LBB8_1
	; AVX1-NEXT: # %bb.2: # %end			; AVX1-NEXT: # %bb.2: # %end
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: sink_splatvar:			; AVX2-LABEL: sink_splatvar:
	; AVX2: # %bb.0: # %entry			; AVX2: # %bb.0: # %entry
	▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines
	; XOPAVX2-NEXT: retq			; XOPAVX2-NEXT: retq
	;			;
	; X32-SSE-LABEL: sink_splatvar:			; X32-SSE-LABEL: sink_splatvar:
	; X32-SSE: # %bb.0: # %entry			; X32-SSE: # %bb.0: # %entry
	; X32-SSE-NEXT: pushl %esi			; X32-SSE-NEXT: pushl %esi
	; X32-SSE-NEXT: .cfi_def_cfa_offset 8			; X32-SSE-NEXT: .cfi_def_cfa_offset 8
	; X32-SSE-NEXT: .cfi_offset %esi, -8			; X32-SSE-NEXT: .cfi_offset %esi, -8
	; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax			; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
	; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
	; X32-SSE-NEXT: xorl %ecx, %ecx			; X32-SSE-NEXT: xorl %ecx, %ecx
	; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0			; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
	; X32-SSE-NEXT: pslld $23, %xmm0			; X32-SSE-NEXT: movd %xmm0, %edx
	; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm0			; X32-SSE-NEXT: andl $31, %edx
	; X32-SSE-NEXT: cvttps2dq %xmm0, %xmm0			; X32-SSE-NEXT: movl $32, %esi
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]			; X32-SSE-NEXT: subl %edx, %esi
				; X32-SSE-NEXT: movd %esi, %xmm0
				; X32-SSE-NEXT: movd %edx, %xmm1
	; X32-SSE-NEXT: xorl %edx, %edx			; X32-SSE-NEXT: xorl %edx, %edx
	; X32-SSE-NEXT: .p2align 4, 0x90			; X32-SSE-NEXT: .p2align 4, 0x90
	; X32-SSE-NEXT: .LBB8_1: # %loop			; X32-SSE-NEXT: .LBB8_1: # %loop
	; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1			; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1
	; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2			; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]			; X32-SSE-NEXT: movdqa %xmm2, %xmm3
	; X32-SSE-NEXT: pmuludq %xmm0, %xmm2			; X32-SSE-NEXT: psrld %xmm0, %xmm3
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]			; X32-SSE-NEXT: pslld %xmm1, %xmm2
	; X32-SSE-NEXT: pmuludq %xmm1, %xmm3			; X32-SSE-NEXT: por %xmm3, %xmm2
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
	; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
	; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
	; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
	; X32-SSE-NEXT: por %xmm4, %xmm2
	; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4)			; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4)
	; X32-SSE-NEXT: addl $4, %ecx			; X32-SSE-NEXT: addl $4, %ecx
	; X32-SSE-NEXT: adcl $0, %edx			; X32-SSE-NEXT: adcl $0, %edx
	; X32-SSE-NEXT: movl %ecx, %esi			; X32-SSE-NEXT: movl %ecx, %esi
	; X32-SSE-NEXT: xorl $256, %esi # imm = 0x100			; X32-SSE-NEXT: xorl $256, %esi # imm = 0x100
	; X32-SSE-NEXT: orl %edx, %esi			; X32-SSE-NEXT: orl %edx, %esi
	; X32-SSE-NEXT: jne .LBB8_1			; X32-SSE-NEXT: jne .LBB8_1
	; X32-SSE-NEXT: # %bb.2: # %end			; X32-SSE-NEXT: # %bb.2: # %end
	▲ Show 20 Lines • Show All 932 Lines • Show Last 20 Lines

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll

Show First 20 Lines • Show All 174 Lines • ▼ Show 20 Lines	if_true:
ret <2 x i64> %mask		ret <2 x i64> %mask

if_false:		if_false:
%res = lshr <2 x i64> %lhs, %mask		%res = lshr <2 x i64> %lhs, %mask
ret <2 x i64> %res		ret <2 x i64> %res
}		}

define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {		define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
; CHECK-LABEL: @funnel_splatvar(		; CHECK-SSE2-LABEL: @funnel_splatvar(
; CHECK-NEXT: entry:		; CHECK-SSE2-NEXT: entry:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0		; CHECK-SSE2-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer		; CHECK-SSE2-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]		; CHECK-SSE2: vector.body:
; CHECK: vector.body:		; CHECK-SSE2-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]		; CHECK-SSE2-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]		; CHECK-SSE2-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*
; CHECK-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*		; CHECK-SSE2-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4
; CHECK-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4		; CHECK-SSE2-NEXT: [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])		; CHECK-SSE2-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[TMP0]])
; CHECK-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4		; CHECK-SSE2-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8		; CHECK-SSE2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536		; CHECK-SSE2-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]		; CHECK-SSE2-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:		; CHECK-SSE2: for.cond.cleanup:
; CHECK-NEXT: ret void		; CHECK-SSE2-NEXT: ret void
		;
		; CHECK-XOP-LABEL: @funnel_splatvar(
		; CHECK-XOP-NEXT: entry:
		; CHECK-XOP-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0
		; CHECK-XOP-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
		; CHECK-XOP-NEXT: br label [[VECTOR_BODY:%.*]]
		; CHECK-XOP: vector.body:
		; CHECK-XOP-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
		; CHECK-XOP-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]
		; CHECK-XOP-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*
		; CHECK-XOP-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4
		; CHECK-XOP-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
		; CHECK-XOP-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
		; CHECK-XOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
		; CHECK-XOP-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
		; CHECK-XOP-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
		; CHECK-XOP: for.cond.cleanup:
		; CHECK-XOP-NEXT: ret void
		;
		; CHECK-AVX-LABEL: @funnel_splatvar(
		spatel (author) [inline comment, done]: The labeling here is a bit misleading - "AVX" means both AVX2 and AVX512, but not AVX1; there is no AVX1 run line on this file. More specific testing is shown in the x86 codegen file.
		; CHECK-AVX-NEXT: entry:
		; CHECK-AVX-NEXT: [[BROADCAST_SPLATINSERT15:%.]] = insertelement <8 x i32> undef, i32 [[ROT:%.]], i32 0
		; CHECK-AVX-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
		; CHECK-AVX-NEXT: br label [[VECTOR_BODY:%.*]]
		; CHECK-AVX: vector.body:
		; CHECK-AVX-NEXT: [[INDEX:%.]] = phi i64 [ 0, [[ENTRY:%.]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
		; CHECK-AVX-NEXT: [[T0:%.]] = getelementptr inbounds i32, i32 [[ARR:%.*]], i64 [[INDEX]]
		; CHECK-AVX-NEXT: [[T1:%.]] = bitcast i32 [[T0]] to <8 x i32>*
		; CHECK-AVX-NEXT: [[WIDE_LOAD:%.]] = load <8 x i32>, <8 x i32> [[T1]], align 4
		; CHECK-AVX-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
		; CHECK-AVX-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
		; CHECK-AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
		; CHECK-AVX-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
		; CHECK-AVX-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
		; CHECK-AVX: for.cond.cleanup:
		; CHECK-AVX-NEXT: ret void
;		;
entry:		entry:
%broadcast.splatinsert15 = insertelement <8 x i32> undef, i32 %rot, i32 0		%broadcast.splatinsert15 = insertelement <8 x i32> undef, i32 %rot, i32 0
%broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> undef, <8 x i32> zeroinitializer		%broadcast.splat16 = shufflevector <8 x i32> %broadcast.splatinsert15, <8 x i32> undef, <8 x i32> zeroinitializer
br label %vector.body		br label %vector.body

vector.body:		vector.body:
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]		%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
Show All 14 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[x86][CGP] enable target hook to sink funnel shift intrinsic's splatted shift amount
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263560

llvm/include/llvm/CodeGen/TargetLowering.h

llvm/lib/Target/X86/X86ISelLowering.h

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/vector-fshl-128.ll

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll

This is an archive of the discontinued LLVM Phabricator instance.

[x86][CGP] enable target hook to sink funnel shift intrinsic's splatted shift amount
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 263560

llvm/include/llvm/CodeGen/TargetLowering.h

llvm/lib/Target/X86/X86ISelLowering.h

llvm/lib/Target/X86/X86ISelLowering.cpp

llvm/test/CodeGen/X86/vector-fshl-128.ll

llvm/test/Transforms/CodeGenPrepare/X86/x86-shuffle-sink.ll

[x86][CGP] enable target hook to sink funnel shift intrinsic's splatted shift amount
ClosedPublic