This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] truncate left shift
ClosedPublic

Authored by rampitec on Jun 27 2017, 4:15 PM.

Download Raw Diff

Details

Reviewers

arsenm
msearles
vpykhtin

Commits

rGeb40733bf07e: Allow to truncate left shift with non-constant shift amount
rL306499: Allow to truncate left shift with non-constant shift amount

Summary

It is pretty common for clang to produce code like
(shl %x, (and %amt, 31)). In this situation we can still perform the
trunc (shl) -> shl (trunc) conversion, given the known value
range of the shift amount.

Diff Detail

Event Timeline

rampitec created this revision. Jun 27 2017, 4:15 PM

Herald added subscribers: t-tye, tpr, dstuttard and 4 others. · View Herald Transcript · Jun 27 2017, 4:15 PM

arsenm added inline comments. Jun 27 2017, 5:06 PM

lib/Target/AMDGPU/AMDGPUISelLowering.cpp
2761–2762	This exact combine is already done in DAGCombiner: "// trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()". Why isn't it triggering? Is the other combine you added missing an AddToWorklist or something?

rampitec added inline comments. Jun 27 2017, 5:08 PM

lib/Target/AMDGPU/AMDGPUISelLowering.cpp
2761–2762	Ah. That one only works with constants.

rampitec mentioned this in D34729: [AMDGPU] Add pattern for v_alignbit_b32 with immediate. Jun 27 2017, 5:15 PM

rampitec added inline comments.

lib/Target/AMDGPU/AMDGPUISelLowering.cpp
2761–2762	Do you want me to transfer it there?

arsenm added inline comments. Jun 27 2017, 5:21 PM

lib/Target/AMDGPU/AMDGPUISelLowering.cpp
2761–2762	Yes, it would be better if it was all in one place

Moved the implementation into DAGCombiner, replacing the existing DAGCombiner optimization, which was only capable of dealing with constant shift amounts.

arsenm added inline comments. Jun 27 2017, 6:56 PM

lib/CodeGen/SelectionDAG/DAGCombiner.cpp
8215 ↗	(On Diff #104329)	Will this regress the constant vector case?

rampitec added inline comments. Jun 27 2017, 7:10 PM

lib/CodeGen/SelectionDAG/DAGCombiner.cpp
8215 ↗	(On Diff #104329)	No.

LGTM

This revision is now accepted and ready to land. Jun 27 2017, 7:14 PM

Added a vector-by-constant-vector testcase.

The vector case won't matter for us, but it may matter for other targets with legal vector operations.

Closed by commit rL306499: Allow to truncate left shift with non-constant shift amount (authored by rampitec). · Explain Why · Jun 27 2017, 7:37 PM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelLowering.h

1 line

AMDGPUISelLowering.cpp

27 lines

test/

CodeGen/

AMDGPU/

alignbit-pat.ll

69 lines

shift-i64-opts.ll

59 lines

Diff 104306

lib/Target/AMDGPU/AMDGPUISelLowering.h

Show First 20 Lines • Show All 77 Lines • ▼ Show 20 Lines	protected:
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;

SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,		SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,		unsigned Opc, SDValue LHS,
uint32_t ValLo, uint32_t ValHi) const;		uint32_t ValLo, uint32_t ValHi) const;
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
		SDValue performTruncCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,		SDValue performCtlzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;		SDValue RHS, DAGCombinerInfo &DCI) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
▲ Show 20 Lines • Show All 328 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 557 Lines • ▼ Show 20 Lines	AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemcpy = 0xffffffff;		MaxStoresPerMemcpy = 0xffffffff;
MaxStoresPerMemmove = 0xffffffff;		MaxStoresPerMemmove = 0xffffffff;
MaxStoresPerMemset = 0xffffffff;		MaxStoresPerMemset = 0xffffffff;

setTargetDAGCombine(ISD::BITCAST);		setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::SHL);		setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);		setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);		setTargetDAGCombine(ISD::SRL);
		setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::MUL);		setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::MULHU);		setTargetDAGCombine(ISD::MULHU);
setTargetDAGCombine(ISD::MULHS);		setTargetDAGCombine(ISD::MULHS);
setTargetDAGCombine(ISD::SELECT);		setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::SELECT_CC);		setTargetDAGCombine(ISD::SELECT_CC);
setTargetDAGCombine(ISD::STORE);		setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::FADD);		setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);		setTargetDAGCombine(ISD::FSUB);
▲ Show 20 Lines • Show All 2,178 Lines • ▼ Show 20 Lines	SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);		SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);		SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);

SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});		SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});

return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);		return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
}		}

		SDValue AMDGPUTargetLowering::performTruncCombine(SDNode *N,
		DAGCombinerInfo &DCI) const {
		arsenmUnsubmitted Done Reply Inline Actions This exact combine is already done in DAGCombiner: // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits() Why isn't it triggering? Is the other combine you added missing an AddToWorklist or something? arsenm: This exact combine is already done in DAGCombiner: // trunc (shl x, K) -> shl (trunc x), K…
		rampitecAuthorUnsubmitted Done Reply Inline Actions Ah. That one only works with constants. rampitec: Ah. That one only works with constants.
		rampitecAuthorUnsubmitted Done Reply Inline Actions Do you want me to transfer it there? rampitec: Do you want me to transfer it there?
		arsenmUnsubmitted Done Reply Inline Actions Yes, it would be better if it was all in one place arsenm: Yes, it would be better if it was all in one place
		EVT VT = N->getValueType(0);
		if (VT != MVT::i32)
		return SDValue();

		SelectionDAG &DAG = DCI.DAG;
		SDValue Op = N->getOperand(0);

		// trunc i64 (shl x, y) to i32 -> i32 (shl (trunc x to i32), y) | y < 32
		if (Op.getOpcode() != ISD::SHL || Op.getValueType() != MVT::i64 ||
		!Op.hasOneUse())
		return SDValue();

		auto RHS = Op.getOperand(1);
		KnownBits Known;
		DAG.computeKnownBits(RHS, Known);
		if (Known.getBitWidth() - Known.countMinLeadingZeros() > 5)
		return SDValue();

		SDValue Lo = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op.getOperand(0));
		return DAG.getNode(ISD::SHL, SDLoc(Op), MVT::i32, Lo, RHS);
		}

// We need to specifically handle i64 mul here to avoid unnecessary conversion		// We need to specifically handle i64 mul here to avoid unnecessary conversion
// instructions. If we only match on the legalized i64 mul expansion,		// instructions. If we only match on the legalized i64 mul expansion,
// SimplifyDemandedBits will be unable to remove them because there will be		// SimplifyDemandedBits will be unable to remove them because there will be
// multiple uses due to the separate mul + mulh[su].		// multiple uses due to the separate mul + mulh[su].
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,		static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
SDValue N0, SDValue N1, unsigned Size, bool Signed) {		SDValue N0, SDValue N1, unsigned Size, bool Signed) {
if (Size <= 32) {		if (Size <= 32) {
unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;		unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
▲ Show 20 Lines • Show All 621 Lines • ▼ Show 20 Lines	case ISD::SRL: {
return performSrlCombine(N, DCI);		return performSrlCombine(N, DCI);
}		}
case ISD::SRA: {		case ISD::SRA: {
if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)		if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
break;		break;

return performSraCombine(N, DCI);		return performSraCombine(N, DCI);
}		}
		case ISD::TRUNCATE:
		return performTruncCombine(N, DCI);
case ISD::MUL:		case ISD::MUL:
return performMulCombine(N, DCI);		return performMulCombine(N, DCI);
case ISD::MULHS:		case ISD::MULHS:
return performMulhsCombine(N, DCI);		return performMulhsCombine(N, DCI);
case ISD::MULHU:		case ISD::MULHU:
return performMulhuCombine(N, DCI);		return performMulhuCombine(N, DCI);
case AMDGPUISD::MUL_I24:		case AMDGPUISD::MUL_I24:
case AMDGPUISD::MUL_U24:		case AMDGPUISD::MUL_U24:
▲ Show 20 Lines • Show All 388 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/alignbit-pat.ll

Show All 10 Lines	bb:
%tmp3 = and i32 %arg2, 31		%tmp3 = and i32 %arg2, 31
%tmp4 = zext i32 %tmp3 to i64		%tmp4 = zext i32 %tmp3 to i64
%tmp5 = lshr i64 %tmp, %tmp4		%tmp5 = lshr i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32		%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %arg1, align 4		store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
ret void		ret void
}		}

; GCN-LABEL: {{^}}alignbit_shl_pat:
; GCN-DAG: s_load_dword s[[SHL:[0-9]+]]
; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: s_sub_i32 s[[SHR:[0-9]+]], 32, s[[SHL]]
; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], s[[SHR]]

define amdgpu_kernel void @alignbit_shl_pat(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
bb:
%tmp = load i64, i64 addrspace(1)* %arg, align 8
%tmp3 = and i32 %arg2, 31
%tmp4 = zext i32 %tmp3 to i64
%tmp5 = shl i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
ret void
}

; GCN-LABEL: {{^}}alignbit_shr_pat_v:		; GCN-LABEL: {{^}}alignbit_shr_pat_v:
; GCN-DAG: load_dword v[[SHR:[0-9]+]],		; GCN-DAG: load_dword v[[SHR:[0-9]+]],
; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}		; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]]		; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]]

define amdgpu_kernel void @alignbit_shr_pat_v(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) {		define amdgpu_kernel void @alignbit_shr_pat_v(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) {
bb:		bb:
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()		%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep1 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid		%gep1 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
%tmp = load i64, i64 addrspace(1)* %gep1, align 8		%tmp = load i64, i64 addrspace(1)* %gep1, align 8
%gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tid		%gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tid
%amt = load i32, i32 addrspace(1)* %gep2, align 4		%amt = load i32, i32 addrspace(1)* %gep2, align 4
%tmp3 = and i32 %amt, 31		%tmp3 = and i32 %amt, 31
%tmp4 = zext i32 %tmp3 to i64		%tmp4 = zext i32 %tmp3 to i64
%tmp5 = lshr i64 %tmp, %tmp4		%tmp5 = lshr i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32		%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %gep2, align 4		store i32 %tmp6, i32 addrspace(1)* %gep2, align 4
ret void		ret void
}		}

; GCN-LABEL: {{^}}alignbit_shl_pat_v:
; GCN-DAG: load_dword v[[SHL:[0-9]+]],
; GCN-DAG: load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN-DAG: v_sub_i32_e32 v[[SHR:[0-9]+]], {{[^,]+}}, 32, v[[SHL]]
; GCN: v_alignbit_b32 v{{[0-9]+}}, v[[HI]], v[[LO]], v[[SHR]]

define amdgpu_kernel void @alignbit_shl_pat_v(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1) {
bb:
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep1 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i32 %tid
%tmp = load i64, i64 addrspace(1)* %gep1, align 8
%gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tid
%amt = load i32, i32 addrspace(1)* %gep2, align 4
%tmp3 = and i32 %amt, 31
%tmp4 = zext i32 %tmp3 to i64
%tmp5 = shl i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %gep2, align 4
ret void
}

; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30:		; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and30:
; Negative test, wrong constant		; Negative test, wrong constant
; GCN: v_lshr_b64		; GCN: v_lshr_b64
; GCN-NOT: v_alignbit_b32		; GCN-NOT: v_alignbit_b32

define amdgpu_kernel void @alignbit_shr_pat_wrong_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {		define amdgpu_kernel void @alignbit_shr_pat_wrong_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
bb:		bb:
%tmp = load i64, i64 addrspace(1)* %arg, align 8		%tmp = load i64, i64 addrspace(1)* %arg, align 8
%tmp3 = and i32 %arg2, 30		%tmp3 = and i32 %arg2, 30
%tmp4 = zext i32 %tmp3 to i64		%tmp4 = zext i32 %tmp3 to i64
%tmp5 = lshr i64 %tmp, %tmp4		%tmp5 = lshr i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32		%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %arg1, align 4		store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
ret void		ret void
}		}

; GCN-LABEL: {{^}}alignbit_shl_pat_wrong_and30:
; Negative test, wrong constant
; GCN: v_lshl_b64
; GCN-NOT: v_alignbit_b32

define amdgpu_kernel void @alignbit_shl_pat_wrong_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
bb:
%tmp = load i64, i64 addrspace(1)* %arg, align 8
%tmp3 = and i32 %arg2, 30
%tmp4 = zext i32 %tmp3 to i64
%tmp5 = shl i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
ret void
}

; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63:		; GCN-LABEL: {{^}}alignbit_shr_pat_wrong_and63:
; Negative test, wrong constant		; Negative test, wrong constant
; GCN: v_lshr_b64		; GCN: v_lshr_b64
; GCN-NOT: v_alignbit_b32		; GCN-NOT: v_alignbit_b32

define amdgpu_kernel void @alignbit_shr_pat_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {		define amdgpu_kernel void @alignbit_shr_pat_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
bb:		bb:
%tmp = load i64, i64 addrspace(1)* %arg, align 8		%tmp = load i64, i64 addrspace(1)* %arg, align 8
%tmp3 = and i32 %arg2, 63		%tmp3 = and i32 %arg2, 63
%tmp4 = zext i32 %tmp3 to i64		%tmp4 = zext i32 %tmp3 to i64
%tmp5 = lshr i64 %tmp, %tmp4		%tmp5 = lshr i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32		%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %arg1, align 4		store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
ret void		ret void
}		}

; GCN-LABEL: {{^}}alignbit_shl_pat_wrong_and63:
; Negative test, wrong constant
; GCN: v_lshl_b64
; GCN-NOT: v_alignbit_b32

define amdgpu_kernel void @alignbit_shl_pat_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
bb:
%tmp = load i64, i64 addrspace(1)* %arg, align 8
%tmp3 = and i32 %arg2, 63
%tmp4 = zext i32 %tmp3 to i64
%tmp5 = shl i64 %tmp, %tmp4
%tmp6 = trunc i64 %tmp5 to i32
store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #0		declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone speculatable }		attributes #0 = { nounwind readnone speculatable }

test/CodeGen/AMDGPU/shift-i64-opts.ll

	Show First 20 Lines • Show All 237 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {			define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
	%val = load i64, i64 addrspace(1)* %in			%val = load i64, i64 addrspace(1)* %in
	%shl = shl i64 %val, 31			%shl = shl i64 %val, 31
	%trunc = trunc i64 %shl to i32			%trunc = trunc i64 %shl to i32
	store volatile i32 %trunc, i32 addrspace(1)* %out			store volatile i32 %trunc, i32 addrspace(1)* %out
	store volatile i64 %shl, i64 addrspace(1)* %in			store volatile i64 %shl, i64 addrspace(1)* %in
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}trunc_shl_and31:
				; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 31
				; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
				; GCN-NOT: v_lshl_b64
				; GCN-NOT: v_lshlrev_b64
				define amdgpu_kernel void @trunc_shl_and31(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
				bb:
				%tmp = load i64, i64 addrspace(1)* %arg, align 8
				%tmp3 = and i32 %arg2, 31
				%tmp4 = zext i32 %tmp3 to i64
				%tmp5 = shl i64 %tmp, %tmp4
				%tmp6 = trunc i64 %tmp5 to i32
				store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
				ret void
				}

				; GCN-LABEL: {{^}}trunc_shl_and30:
				; GCN: s_and_b32 s[[AMT:[0-9]+]], s{{[0-9]+}}, 30
				; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, s[[AMT]], v{{[0-9]+}}
				; GCN-NOT: v_lshl_b64
				; GCN-NOT: v_lshlrev_b64
				define amdgpu_kernel void @trunc_shl_and30(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
				bb:
				%tmp = load i64, i64 addrspace(1)* %arg, align 8
				%tmp3 = and i32 %arg2, 30
				%tmp4 = zext i32 %tmp3 to i64
				%tmp5 = shl i64 %tmp, %tmp4
				%tmp6 = trunc i64 %tmp5 to i32
				store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
				ret void
				}

				; GCN-LABEL: {{^}}trunc_shl_wrong_and63:
				; Negative test, wrong constant
				; GCN: v_lshl_b64
				define amdgpu_kernel void @trunc_shl_wrong_and63(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
				bb:
				%tmp = load i64, i64 addrspace(1)* %arg, align 8
				%tmp3 = and i32 %arg2, 63
				%tmp4 = zext i32 %tmp3 to i64
				%tmp5 = shl i64 %tmp, %tmp4
				%tmp6 = trunc i64 %tmp5 to i32
				store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
				ret void
				}

				; GCN-LABEL: {{^}}trunc_shl_no_and:
				; Negative test, shift can be full 64 bit
				; GCN: v_lshl_b64
				define amdgpu_kernel void @trunc_shl_no_and(i64 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) {
				bb:
				%tmp = load i64, i64 addrspace(1)* %arg, align 8
				%tmp4 = zext i32 %arg2 to i64
				%tmp5 = shl i64 %tmp, %tmp4
				%tmp6 = trunc i64 %tmp5 to i32
				store i32 %tmp6, i32 addrspace(1)* %arg1, align 4
				ret void
				}