Diff 151159

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 101 Lines • ▼ Show 20 Lines
private:		private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;		std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;		bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;		bool isInlineImmediate(const SDNode *N) const;

bool isConstantLoad(const MemSDNode *N, int cbID) const;		bool isConstantLoad(const MemSDNode *N, int cbID) const;
bool isUniformBr(const SDNode *N) const;		bool isUniformBr(const SDNode *N) const;

		MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

SDNode glueCopyToM0(SDNode N) const;		SDNode glueCopyToM0(SDNode N) const;

const TargetRegisterClass getOperandRegClass(SDNode N, unsigned OpNo) const;		const TargetRegisterClass getOperandRegClass(SDNode N, unsigned OpNo) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);		bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,		bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
SDValue& Offset);		SDValue& Offset);
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);		virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);		virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
▲ Show 20 Lines • Show All 240 Lines • ▼ Show 20 Lines	SDNode AMDGPUDAGToDAGISel::glueCopyToM0(SDNode N) const {
SmallVector <SDValue, 8> Ops;		SmallVector <SDValue, 8> Ops;
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {		for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
Ops.push_back(N->getOperand(i));		Ops.push_back(N->getOperand(i));
}		}
Ops.push_back(Glue);		Ops.push_back(Glue);
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);		return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}		}

		MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(
		SDLoc &DL, uint64_t Imm, EVT VT) const {
		SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
		CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
		MVT::i32));
		SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
		CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
		nhaehnle (inline comment, Done): Have you run clang-format on this? It looks a bit off.
		const SDValue Ops[] = {
		CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
		SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
		SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
		};

		return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
		}

static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {		static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {		switch (NumVectorElts) {
case 1:		case 1:
return AMDGPU::SReg_32_XM0RegClassID;		return AMDGPU::SReg_32_XM0RegClassID;
case 2:		case 2:
return AMDGPU::SReg_64RegClassID;		return AMDGPU::SReg_64RegClassID;
case 4:		case 4:
return AMDGPU::SReg_128RegClassID;		return AMDGPU::SReg_128RegClassID;
▲ Show 20 Lines • Show All 170 Lines • ▼ Show 20 Lines	case ISD::ConstantFP: {
if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))		if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();		Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
else {		else {
ConstantSDNode *C = cast<ConstantSDNode>(N);		ConstantSDNode *C = cast<ConstantSDNode>(N);
Imm = C->getZExtValue();		Imm = C->getZExtValue();
}		}

SDLoc DL(N);		SDLoc DL(N);
SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,		ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
MVT::i32));
SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
const SDValue Ops[] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
};

ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
N->getValueType(0), Ops));
return;		return;
}		}
case ISD::LOAD:		case ISD::LOAD:
case ISD::STORE: {		case ISD::STORE: {
N = glueCopyToM0(N);		N = glueCopyToM0(N);
break;		break;
}		}

▲ Show 20 Lines • Show All 419 Lines • ▼ Show 20 Lines	bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
// FIXME: This is broken on SI where we still need to check if the base		// FIXME: This is broken on SI where we still need to check if the base
// pointer is positive here.		// pointer is positive here.
Base = Addr;		Base = Addr;
Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);		Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);		Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
return true;		return true;
}		}


		nhaehnle (inline comment, Done): Spurious whitespace change.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,		bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &VAddr, SDValue &SOffset,		SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &Offen,		SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,		SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,		SDValue &GLC, SDValue &SLC,
SDValue &TFE) const {		SDValue &TFE) const {
		arsenm (inline comment, Not Done): SReg_64_XEXEC
		arsenm (inline comment, Done): Probably should also create a helper that the constant lowering can use as well, since this is basically the same.
		tpr (author, inline comment, Not Done): I left the SReg_64RegClassID as that's what the existing code for a 64-bit constant load has. I have done the suggested helper function.
// Subtarget prefers to use flat instruction		// Subtarget prefers to use flat instruction
if (Subtarget->useFlatForGlobal())		if (Subtarget->useFlatForGlobal())
return false;		return false;

SDLoc DL(Addr);		SDLoc DL(Addr);

if (!GLC.getNode())		if (!GLC.getNode())
GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);		GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
Show All 11 Lines	if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N1 = Addr.getOperand(1);		SDValue N1 = Addr.getOperand(1);
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);		ConstantSDNode *C1 = cast<ConstantSDNode>(N1);

if (N0.getOpcode() == ISD::ADD) {		if (N0.getOpcode() == ISD::ADD) {
// (add (add N2, N3), C1) -> addr64		// (add (add N2, N3), C1) -> addr64
SDValue N2 = N0.getOperand(0);		SDValue N2 = N0.getOperand(0);
SDValue N3 = N0.getOperand(1);		SDValue N3 = N0.getOperand(1);
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);		Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
		if (N2->isDivergent()) {
		if (N3->isDivergent()) {
		// Both N2 and N3 are divergent. Keep the add and use N2+N3 as the
		// vaddr, and construct the resource out of 0.
		Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
		VAddr = N0;
		} else {
		// N2 is divergent, N3 is not.
		Ptr = N3;
		VAddr = N2;
		}
		} else {
		// N2 is not divergent.
Ptr = N2;		Ptr = N2;
VAddr = N3;		VAddr = N3;
		}
} else {		} else {
// (add N0, C1) -> offset		// (add N0, C1) -> offset
VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);		VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = N0;		Ptr = N0;
}		}

if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {		if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);		Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
Show All 10 Lines	if (CurDAG->isBaseWithConstantOffset(Addr)) {
}		}
}		}

if (Addr.getOpcode() == ISD::ADD) {		if (Addr.getOpcode() == ISD::ADD) {
// (add N0, N1) -> addr64		// (add N0, N1) -> addr64
SDValue N0 = Addr.getOperand(0);		SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);		SDValue N1 = Addr.getOperand(1);
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);		Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

		if (N0->isDivergent()) {
		if (N1->isDivergent()) {
		// Both N0 and N1 are divergent. Use the result of the add as the
		// addr64, and construct the resource from a 0 address.
		Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
		VAddr = Addr;
		} else {
		// N0 is divergent, N1 is not.
		Ptr = N1;
		VAddr = N0;
		}
		} else {
		// N0 is not divergent.
Ptr = N0;		Ptr = N0;
VAddr = N1;		VAddr = N1;
		}
		nhaehnle (inline comment, Done): This has a lot of redundancy with the isBaseWithConstantOffset case above. Perhaps the cases can be combined?
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);		Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;		return true;
}		}

// default case -> offset		// default case -> offset
VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);		VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = Addr;		Ptr = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);		Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
▲ Show 20 Lines • Show All 1,180 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll

This file was added.

				; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s

				; Check that an addrspace(1) (const) load with various combinations of
				; uniform, nonuniform and constant address components all load with an
				; addr64 mubuf with no readfirstlane.

				@indexable = internal unnamed_addr addrspace(1) constant [6 x <3 x float>] [<3 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00>, <3 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, <3 x float> <float 1.000000e+00, float 0.000000e+00, float 1.000000e+00>, <3 x float> <float 1.000000e+00, float 1.000000e+00, float 0.000000e+00>]

				; GCN-LABEL: {{^}}nonuniform_uniform:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dwordx4 {{.*}} addr64

				define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
				.entry:
				%tmp31 = sext i32 %arg18 to i64
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31
				%tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
				%tmp34 = extractelement <3 x float> %tmp33, i32 0
				ret float %tmp34
				}

				; GCN-LABEL: {{^}}uniform_nonuniform:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dwordx4 {{.*}} addr64

				define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
				%tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
				%tmp34 = extractelement <3 x float> %tmp33, i32 0
				ret float %tmp34
				}

				nhaehnle (inline comment, Not Done): Could you please add a similar test-case, with a non-uniform i64 %arg18 and %offset a constant? I don't think this case is covered by tests, and I'm not sure that the code would do the right thing for that case, where I think Addr64 would also be needed.
				tpr (author, inline comment, Not Done): Good spot; that case was not covered.
				; GCN-LABEL: {{^}}nonuniform_nonuniform:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dwordx4 {{.*}} addr64

				define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
				%tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
				%tmp34 = extractelement <3 x float> %tmp33, i32 0
				ret float %tmp34
				}

				; GCN-LABEL: {{^}}nonuniform_uniform_const:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dword {{.*}} addr64

				define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
				.entry:
				%tmp31 = sext i32 %arg18 to i64
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31, i64 1
				%tmp33 = load float, float addrspace(1)* %tmp32, align 4
				ret float %tmp33
				}

				; GCN-LABEL: {{^}}uniform_nonuniform_const:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dword {{.*}} addr64

				define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
				%tmp33 = load float, float addrspace(1)* %tmp32, align 4
				ret float %tmp33
				}

				; GCN-LABEL: {{^}}nonuniform_nonuniform_const:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dword {{.*}} addr64

				define amdgpu_ps float @nonuniform_nonuniform_const(i32 %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
				%tmp33 = load float, float addrspace(1)* %tmp32, align 4
				ret float %tmp33
				}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 151159

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 151159

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll

[AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
ClosedPublic