Diff 158855

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Show First 20 Lines • Show All 100 Lines • ▼ Show 20 Lines

private:		private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;		std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
bool isNoNanSrc(SDValue N) const;		bool isNoNanSrc(SDValue N) const;
bool isInlineImmediate(const SDNode *N) const;		bool isInlineImmediate(const SDNode *N) const;

bool isUniformBr(const SDNode *N) const;		bool isUniformBr(const SDNode *N) const;

		MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

SDNode glueCopyToM0(SDNode N) const;		SDNode glueCopyToM0(SDNode N) const;

const TargetRegisterClass getOperandRegClass(SDNode N, unsigned OpNo) const;		const TargetRegisterClass getOperandRegClass(SDNode N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);		virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);		virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,		bool isDSOffsetLegal(const SDValue &Base, unsigned Offset,
unsigned OffsetBits) const;		unsigned OffsetBits) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;		bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
▲ Show 20 Lines • Show All 250 Lines • ▼ Show 20 Lines	SDNode AMDGPUDAGToDAGISel::glueCopyToM0(SDNode N) const {
SmallVector <SDValue, 8> Ops;		SmallVector <SDValue, 8> Ops;
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {		for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
Ops.push_back(N->getOperand(i));		Ops.push_back(N->getOperand(i));
}		}
Ops.push_back(Glue);		Ops.push_back(Glue);
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);		return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}		}

		MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
		EVT VT) const {
		// Build the 64-bit immediate Imm from two 32-bit halves: each half is
		// produced by its own S_MOV_B32, and the two results are joined by a
		// REG_SEQUENCE of type VT in the SReg_64 register class. The returned node
		// is that REG_SEQUENCE.
		const uint64_t LowHalf = Imm & 0xFFFFFFFF;
		const uint64_t HighHalf = Imm >> 32;

		// Low 32 bits.
		SDValue LowImm = CurDAG->getConstant(LowHalf, DL, MVT::i32);
		SDNode *LowMov =
		CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, LowImm);

		// High 32 bits.
		SDValue HighImm = CurDAG->getConstant(HighHalf, DL, MVT::i32);
		SDNode *HighMov =
		CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, HighImm);

		// REG_SEQUENCE operands: register class id, then (value, subregister
		// index) pairs. The low half goes to sub0 and the high half to sub1.
		SDValue RegClass =
		CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
		SDValue Sub0Idx = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
		SDValue Sub1Idx = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
		const SDValue Parts[] = {RegClass, SDValue(LowMov, 0), Sub0Idx,
		SDValue(HighMov, 0), Sub1Idx};

		return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Parts);
		}

static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {		static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {		switch (NumVectorElts) {
case 1:		case 1:
return AMDGPU::SReg_32_XM0RegClassID;		return AMDGPU::SReg_32_XM0RegClassID;
case 2:		case 2:
return AMDGPU::SReg_64RegClassID;		return AMDGPU::SReg_64RegClassID;
case 4:		case 4:
return AMDGPU::SReg_128RegClassID;		return AMDGPU::SReg_128RegClassID;
▲ Show 20 Lines • Show All 169 Lines • ▼ Show 20 Lines	case ISD::ConstantFP: {
if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))		if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();		Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
else {		else {
ConstantSDNode *C = cast<ConstantSDNode>(N);		ConstantSDNode *C = cast<ConstantSDNode>(N);
Imm = C->getZExtValue();		Imm = C->getZExtValue();
}		}

SDLoc DL(N);		SDLoc DL(N);
SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,		ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
MVT::i32));
SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
const SDValue Ops[] = {
CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
};

ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
N->getValueType(0), Ops));
return;		return;
}		}
case ISD::LOAD:		case ISD::LOAD:
case ISD::STORE:		case ISD::STORE:
case ISD::ATOMIC_LOAD:		case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: {		case ISD::ATOMIC_STORE: {
N = glueCopyToM0(N);		N = glueCopyToM0(N);
break;		break;
▲ Show 20 Lines • Show All 428 Lines • ▼ Show 20 Lines	if (!SLC.getNode())
SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);		SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);		TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);

Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);		Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);		Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);		Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);		SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

		ConstantSDNode *C1 = nullptr;
		SDValue N0 = Addr;
if (CurDAG->isBaseWithConstantOffset(Addr)) {		if (CurDAG->isBaseWithConstantOffset(Addr)) {
SDValue N0 = Addr.getOperand(0);		C1 = cast<ConstantSDNode>(Addr.getOperand(1));
SDValue N1 = Addr.getOperand(1);		if (isUInt<32>(C1->getZExtValue()))
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);		N0 = Addr.getOperand(0);
		else
		C1 = nullptr;
		}

if (N0.getOpcode() == ISD::ADD) {		if (N0.getOpcode() == ISD::ADD) {
		// (add N2, N3) -> addr64, or
// (add (add N2, N3), C1) -> addr64		// (add (add N2, N3), C1) -> addr64
SDValue N2 = N0.getOperand(0);		SDValue N2 = N0.getOperand(0);
SDValue N3 = N0.getOperand(1);		SDValue N3 = N0.getOperand(1);
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);		Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

		if (N2->isDivergent()) {
		if (N3->isDivergent()) {
		// Both N2 and N3 are divergent. Use N0 (the result of the add) as the
		// addr64, and construct the resource from a 0 address.
		Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
		VAddr = N0;
		} else {
		// N2 is divergent, N3 is not.
		Ptr = N3;
		VAddr = N2;
		}
		} else {
		// N2 is not divergent.
Ptr = N2;		Ptr = N2;
VAddr = N3;		VAddr = N3;
		}
		Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
		} else if (N0->isDivergent()) {
		// N0 is divergent. Use it as the addr64, and construct the resource from a
		// 0 address.
		Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
		VAddr = N0;
		Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
} else {		} else {
// (add N0, C1) -> offset		// N0 -> offset, or
		// (N0 + C1) -> offset
VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);		VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = N0;		Ptr = N0;
}		}

		if (!C1) {
		// No offset.
		Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
		return true;
		}

if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {		if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
		// Legal offset for instruction.
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);		Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
return true;		return true;
}		}

if (isUInt<32>(C1->getZExtValue())) {
// Illegal offset, store it in soffset.		// Illegal offset, store it in soffset.
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);		Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,		SOffset =
		SDValue(CurDAG->getMachineNode(
		AMDGPU::S_MOV_B32, DL, MVT::i32,
CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),		CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
0);		0);
return true;		return true;
}		}
}

if (Addr.getOpcode() == ISD::ADD) {
// (add N0, N1) -> addr64
SDValue N0 = Addr.getOperand(0);
SDValue N1 = Addr.getOperand(1);
Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
Ptr = N0;
VAddr = N1;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
return true;
}

// default case -> offset
VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
Ptr = Addr;
Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);

return true;
}

bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,		bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,		SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &GLC,		SDValue &Offset, SDValue &GLC,
SDValue &SLC, SDValue &TFE) const {		SDValue &SLC, SDValue &TFE) const {
SDValue Ptr, Offen, Idxen, Addr64;		SDValue Ptr, Offen, Idxen, Addr64;

// addr64 bit was removed for volcanic islands.		// addr64 bit was removed for volcanic islands.
▲ Show 20 Lines • Show All 1,204 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll

	; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=isel -o - %s \| FileCheck -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -stop-after=isel -o - %s \| FileCheck -check-prefix=GCN %s

	; Type legalization for illegal FP type results was dropping invariant			; Type legalization for illegal FP type results was dropping invariant
	; and dereferenceable flags.			; and dereferenceable flags.

	; GCN: BUFFER_LOAD_USHORT_OFFSET killed %{{[0-9]+}}, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 2 from %ir.ptr, addrspace 4)			; GCN: BUFFER_LOAD_USHORT{{.*}} :: (dereferenceable invariant load 2 from %ir.ptr, addrspace 4)
	define half @legalize_f16_load(half addrspace(4)* dereferenceable(4) %ptr) {			define half @legalize_f16_load(half addrspace(4)* dereferenceable(4) %ptr) {
	%load = load half, half addrspace(4)* %ptr, !invariant.load !0			%load = load half, half addrspace(4)* %ptr, !invariant.load !0
	%add = fadd half %load, 1.0			%add = fadd half %load, 1.0
	ret half %add			ret half %add
	}			}

	!0 = !{}			!0 = !{}

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll

Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()		%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
%cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*		%cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
%load = load volatile i32, i32 addrspace(4)* %cast		%load = load volatile i32, i32 addrspace(4)* %cast
ret void		ret void
}		}

; GCN-LABEL: {{^}}func_implicitarg_ptr:		; GCN-LABEL: {{^}}func_implicitarg_ptr:
; GCN: s_waitcnt		; GCN: s_waitcnt
; MESA: s_mov_b64 s[8:9], s[6:7]		; MESA: v_mov_b32_e32 v0, s6
; MESA: s_mov_b32 s11, 0xf000		; MESA: v_mov_b32_e32 v1, s7
; MESA: s_mov_b32 s10, -1		; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; MESA: buffer_load_dword v0, off, s[8:11], 0
; HSA: v_mov_b32_e32 v0, s6		; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7		; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]		; HSA: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt		; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64		; GCN-NEXT: s_setpc_b64
define void @func_implicitarg_ptr() #0 {		define void @func_implicitarg_ptr() #0 {
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()		%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
%cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*		%cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
%load = load volatile i32, i32 addrspace(4)* %cast		%load = load volatile i32, i32 addrspace(4)* %cast
ret void		ret void
}		}

; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:		; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
; GCN: s_waitcnt		; GCN: s_waitcnt
; MESA: s_mov_b64 s[8:9], s[6:7]		; MESA: v_mov_b32_e32 v0, s6
; MESA: s_mov_b32 s11, 0xf000		; MESA: v_mov_b32_e32 v1, s7
; MESA: s_mov_b32 s10, -1		; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; MESA: buffer_load_dword v0, off, s[8:11], 0
; HSA: v_mov_b32_e32 v0, s6		; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7		; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]		; HSA: flat_load_dword v0, v[0:1]
; GCN-NEXT: s_waitcnt		; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64		; GCN-NEXT: s_setpc_b64
define void @opencl_func_implicitarg_ptr() #0 {		define void @opencl_func_implicitarg_ptr() #0 {
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()		%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
%cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*		%cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
; GCN-NOT: s[6:7]		; GCN-NOT: s[6:7]
define void @opencl_func_call_implicitarg_ptr_func() #0 {		define void @opencl_func_call_implicitarg_ptr_func() #0 {
call void @func_implicitarg_ptr()		call void @func_implicitarg_ptr()
ret void		ret void
}		}

; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:		; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt		; GCN: s_waitcnt
; MESA: s_mov_b64 s[12:13], s[6:7]		; MESA: v_mov_b32_e32 v0, s6
; MESA: s_mov_b32 s15, 0xf000		; MESA: v_mov_b32_e32 v1, s7
; MESA: s_mov_b32 s14, -1		; MESA: v_mov_b32_e32 v2, s8
; MESA: buffer_load_dword v0, off, s[12:15], 0		; MESA: v_mov_b32_e32 v3, s9
		; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s6		; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7		; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]		; HSA: flat_load_dword v0, v[0:1]
; MESA: s_mov_b32 s10, s14		; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
; MESA: s_mov_b32 s11, s15
; MESA: buffer_load_dword v0, off, s[8:11], 0
; HSA: v_mov_b32_e32 v0, s8		; HSA: v_mov_b32_e32 v0, s8
; HSA: v_mov_b32_e32 v1, s9		; HSA: v_mov_b32_e32 v1, s9
; HSA: flat_load_dword v0, v[0:1]		; HSA: flat_load_dword v0, v[0:1]

; GCN: s_waitcnt vmcnt(0)		; GCN: s_waitcnt vmcnt(0)
define void @func_kernarg_implicitarg_ptr() #0 {		define void @func_kernarg_implicitarg_ptr() #0 {
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()		%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()		%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
%cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*		%cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
%cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*		%cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
%load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr		%load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
%load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg		%load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
ret void		ret void
}		}

; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:		; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
; GCN: s_waitcnt		; GCN: s_waitcnt
; MESA: s_mov_b64 s[12:13], s[6:7]		; MESA: v_mov_b32_e32 v0, s6
; MESA: s_mov_b32 s15, 0xf000		; MESA: v_mov_b32_e32 v1, s7
; MESA: s_mov_b32 s14, -1		; MESA: v_mov_b32_e32 v2, s8
; MESA: buffer_load_dword v0, off, s[12:15], 0		; MESA: v_mov_b32_e32 v3, s9
		; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; HSA: v_mov_b32_e32 v0, s6		; HSA: v_mov_b32_e32 v0, s6
; HSA: v_mov_b32_e32 v1, s7		; HSA: v_mov_b32_e32 v1, s7
; HSA: flat_load_dword v0, v[0:1]		; HSA: flat_load_dword v0, v[0:1]
; MESA: s_mov_b32 s10, s14		; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
; MESA: s_mov_b32 s11, s15
; MESA: buffer_load_dword v0, off, s[8:11], 0
; HSA: v_mov_b32_e32 v0, s8		; HSA: v_mov_b32_e32 v0, s8
; HSA: v_mov_b32_e32 v1, s9		; HSA: v_mov_b32_e32 v1, s9
; HSA: flat_load_dword v0, v[0:1]		; HSA: flat_load_dword v0, v[0:1]

; GCN: s_waitcnt vmcnt(0)		; GCN: s_waitcnt vmcnt(0)
define void @opencl_func_kernarg_implicitarg_ptr() #0 {		define void @opencl_func_kernarg_implicitarg_ptr() #0 {
%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()		%kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()		%implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
Show All 34 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll

	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=SIVI -check-prefix=FUNC %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=SIVI -check-prefix=FUNC %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s \| FileCheck -check-prefix=VI -check-prefix=SIVI -check-prefix=VIGFX9 -check-prefix=FUNC %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s \| FileCheck -check-prefix=VI -check-prefix=SIVI -check-prefix=VIGFX9 -check-prefix=FUNC %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX9 -check-prefix=VIGFX9 -check-prefix=FUNC %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX9 -check-prefix=VIGFX9 -check-prefix=FUNC %s

	declare half @llvm.log.f16(half %a)			declare half @llvm.log.f16(half %a)
	declare <2 x half> @llvm.log.v2f16(<2 x half> %a)			declare <2 x half> @llvm.log.v2f16(<2 x half> %a)

	; FUNC-LABEL: {{^}}log_f16			; FUNC-LABEL: {{^}}log_f16
	; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]			; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]
	; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]			; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]
	; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]			; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]
	; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
	; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]			; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
	; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]			; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
	; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3f317218, v[[R_F32_0]]			; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3f317218, v[[R_F32_0]]
	; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_1]]			; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_1]]
	; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]			; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
	; VIGFX9: v_mul_f16_e32 v[[R_F16_0]], 0x398c, v[[R_F16_0]]			; VIGFX9: v_mul_f16_e32 v[[R_F16_0]], 0x398c, v[[R_F16_0]]
	; SI: buffer_store_short v[[R_F16_0]], v{{\[[0-9]+:[0-9]+\]}}			; SI: buffer_store_short v[[R_F16_0]], v{{\[[0-9]+:[0-9]+\]}}
	; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R_F16_0]]			; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R_F16_0]]
	▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll

	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=SIVI -check-prefix=FUNC %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=SIVI -check-prefix=FUNC %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s \| FileCheck -check-prefix=VI -check-prefix=SIVI -check-prefix=VIGFX9 -check-prefix=FUNC %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s \| FileCheck -check-prefix=VI -check-prefix=SIVI -check-prefix=VIGFX9 -check-prefix=FUNC %s
	; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX9 -check-prefix=VIGFX9 -check-prefix=FUNC %s			; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefix=GFX9 -check-prefix=VIGFX9 -check-prefix=FUNC %s

	declare half @llvm.log10.f16(half %a)			declare half @llvm.log10.f16(half %a)
	declare <2 x half> @llvm.log10.v2f16(<2 x half> %a)			declare <2 x half> @llvm.log10.v2f16(<2 x half> %a)

	; GCN-LABEL: {{^}}log10_f16			; GCN-LABEL: {{^}}log10_f16
	; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]			; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]
	; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]			; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]
	; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]			; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]
	; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
	; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]			; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
	; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]			; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
	; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3e9a209a, v[[R_F32_0]]			; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3e9a209a, v[[R_F32_0]]
	; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_1]]			; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_1]]
	; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]			; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
	; VIGFX9: v_mul_f16_e32 v[[R_F16_0]], 0x34d1, v[[R_F16_0]]			; VIGFX9: v_mul_f16_e32 v[[R_F16_0]], 0x34d1, v[[R_F16_0]]
	; SI: buffer_store_short v[[R_F16_0]], v{{\[[0-9]+:[0-9]+\]}}			; SI: buffer_store_short v[[R_F16_0]], v{{\[[0-9]+:[0-9]+\]}}
	; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R_F16_0]]			; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[R_F16_0]]
	▲ Show 20 Lines • Show All 51 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll

				; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s \| FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s

				; Check that an addrspace(1) (const) load with various combinations of
				; uniform, nonuniform and constant address components all load with an
				; addr64 mubuf with no readfirstlane.

				@indexable = internal unnamed_addr addrspace(1) constant [6 x <3 x float>] [<3 x float> <float 1.000000e+00, float 0.000000e+00, float 0.000000e+00>, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 0.000000e+00>, <3 x float> <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00>, <3 x float> <float 0.000000e+00, float 1.000000e+00, float 1.000000e+00>, <3 x float> <float 1.000000e+00, float 0.000000e+00, float 1.000000e+00>, <3 x float> <float 1.000000e+00, float 1.000000e+00, float 0.000000e+00>]

				; GCN-LABEL: {{^}}nonuniform_uniform:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dwordx4 {{.*}} addr64

				; Indexes the global constant array @indexable with the sign-extended
				; argument %arg18, loads the selected <3 x float>, and returns element 0.
				; NOTE(review): presumably %arg18 is the nonuniform part and @indexable the
				; uniform part, going by the test name - confirm.
				define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
				.entry:
				%tmp31 = sext i32 %arg18 to i64
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31
				%tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
				%tmp34 = extractelement <3 x float> %tmp33, i32 0
				ret float %tmp34
				}

				; GCN-LABEL: {{^}}uniform_nonuniform:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dwordx4 {{.*}} addr64

				; Forms the base pointer from the zero-extended argument %arg18 (inttoptr),
				; indexes it with the inreg argument %offset, loads the selected
				; <3 x float>, and returns element 0.
				; NOTE(review): presumably the inreg %offset is the uniform part and %arg18
				; the nonuniform part, going by the test name - confirm.
				define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
				%tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
				%tmp34 = extractelement <3 x float> %tmp33, i32 0
				ret float %tmp34
				}

				; GCN-LABEL: {{^}}const_nonuniform:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dwordx4 {{.*}} addr64

				; Forms the base pointer from the zero-extended argument %arg18 (inttoptr),
				; selects array element 1 with a constant index, loads that <3 x float>,
				; and returns element 0.
				define amdgpu_ps float @const_nonuniform(i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 1
				%tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
				%tmp34 = extractelement <3 x float> %tmp33, i32 0
				ret float %tmp34
				}

				; GCN-LABEL: {{^}}nonuniform_nonuniform:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dwordx4 {{.*}} addr64

				; Forms the base pointer from the zero-extended argument %arg18 (inttoptr),
				; indexes it with the argument %offset (not inreg here), loads the selected
				; <3 x float>, and returns element 0.
				define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
				%tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
				%tmp34 = extractelement <3 x float> %tmp33, i32 0
				ret float %tmp34
				}

				; GCN-LABEL: {{^}}nonuniform_uniform_const:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dword {{.*}} addr64

				; Indexes the global constant array @indexable with the sign-extended
				; argument %arg18, then addresses component 1 of that vector with a constant
				; index, and returns the single float loaded from there.
				define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
				.entry:
				%tmp31 = sext i32 %arg18 to i64
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31, i64 1
				%tmp33 = load float, float addrspace(1)* %tmp32, align 4
				ret float %tmp33
				}

				; GCN-LABEL: {{^}}uniform_nonuniform_const:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dword {{.*}} addr64

				; Forms the base pointer from the zero-extended argument %arg18 (inttoptr),
				; indexes it with the inreg argument %offset, then addresses component 1 of
				; that vector with a constant index, and returns the float loaded from there.
				define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
				%tmp33 = load float, float addrspace(1)* %tmp32, align 4
				ret float %tmp33
				}

				; GCN-LABEL: {{^}}nonuniform_nonuniform_const:
				; GCN-NOT: readfirstlane
				; SICI: buffer_load_dword {{.*}} addr64

				; Forms the base pointer from the zero-extended argument %arg18 (inttoptr),
				; indexes it with the argument %offset (not inreg here), then addresses
				; component 1 of that vector with a constant index, and returns the float
				; loaded from there.
				define amdgpu_ps float @nonuniform_nonuniform_const(i32 %offset, i32 %arg18) {
				.entry:
				%tmp1 = zext i32 %arg18 to i64
				%tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
				%tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
				%tmp33 = load float, float addrspace(1)* %tmp32, align 4
				ret float %tmp33
				}

llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll

	Show First 20 Lines • Show All 195 Lines • ▼ Show 20 Lines
	; SI: buffer_load_dword [[VBOUND:v[0-9]+]]			; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
	; SI: v_cmp_lt_i32_e32 vcc			; SI: v_cmp_lt_i32_e32 vcc
	; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc			; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
	; SI-NEXT: ; mask branch			; SI-NEXT: ; mask branch
	; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]			; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]

	; Initialize inner condition to false			; Initialize inner condition to false
	; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader			; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
	; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}			; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}
	; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]

	; Clear exec bits for workitems that load -1s			; Clear exec bits for workitems that load -1s
	; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:			; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
	; SI: buffer_load_dword [[B:v[0-9]+]]			; SI: buffer_load_dword [[B:v[0-9]+]]
	; SI: buffer_load_dword [[A:v[0-9]+]]			; SI: buffer_load_dword [[A:v[0-9]+]]
	; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]			; SI-DAG: v_cmp_ne_u32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
	; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]			; SI-DAG: v_cmp_ne_u32_e32 [[NEG1_CHECK_1:vcc]], -1, [[B]]
	; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]			; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
	▲ Show 20 Lines • Show All 58 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 158855

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll

llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll

llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 158855

llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll

llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll

llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll

[AMDGPU] Avoid using divergent value in mubuf addr64 descriptor
ClosedPublic