Index: lib/Target/AMDGPU/R600ISelLowering.h
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.h
+++ lib/Target/AMDGPU/R600ISelLowering.h
@@ -98,9 +98,11 @@
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
-  bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
-                   SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
-                   SelectionDAG &DAG) const;
+  bool FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src,
+                   SDValue &Neg, SDValue &Abs, SDValue &Sel, SDValue &Imm,
+                   SelectionDAG &DAG) const;
+  SDValue constBufferLoad(LoadSDNode *LoadNode, int Block,
+                          SelectionDAG &DAG) const;
 
   SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
 };
Index: lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/R600ISelLowering.cpp
+++ lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -903,7 +903,7 @@
                                                    unsigned DwordOffset) const {
   unsigned ByteOffset = DwordOffset * 4;
   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                           AMDGPUASI.CONSTANT_BUFFER_0);
+                                           AMDGPUASI.PARAM_I_ADDRESS);
 
   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
   assert(isInt<16>(ByteOffset));
@@ -1482,33 +1482,17 @@
     return scalarizeVectorLoad(LoadNode, DAG);
   }
 
+  // This is still used for explicit load from addrspace(8)
   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
   if (ConstantBlock > -1 &&
       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
     SDValue Result;
-    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
-        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
+    if (isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
         isa<ConstantSDNode>(Ptr)) {
-      SDValue Slots[4];
-      for (unsigned i = 0; i < 4; i++) {
-        // We want Const position encoded with the following formula :
-        // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
-        // const_index is Ptr computed by llvm using an alignment of 16.
-        // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
-        // then div by 4 at the ISel step
-        SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
-            DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
-        Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
-      }
-      EVT NewVT = MVT::v4i32;
-      unsigned NumElements = 4;
-      if (VT.isVector()) {
-        NewVT = VT;
-        NumElements = VT.getVectorNumElements();
-      }
-      Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+      return constBufferLoad(LoadNode, LoadNode->getAddressSpace(), DAG);
     } else {
+      //TODO: Does this even work?
       // non-constant ptr can't be folded, keeps it as a v4f32 load
       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
                            DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
@@ -1647,7 +1631,7 @@
   }
 
   PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                        AMDGPUASI.CONSTANT_BUFFER_0);
+                                        AMDGPUASI.PARAM_I_ADDRESS);
 
   // i64 isn't a legal type, so the register type used ends up as i32, which
   // isn't expected here. It attempts to create this sextload, but it ends up
@@ -1671,17 +1655,17 @@
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
+    unsigned Alignment = MinAlign(VT.getStoreSize(), PartOffset);
 
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
         ISD::UNINDEXED, Ext, VT, DL, Chain,
         DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
         PtrInfo,
-        MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
+        MemVT, Alignment, MachineMemOperand::MONonTemporal |
                                     MachineMemOperand::MODereferenceable |
                                     MachineMemOperand::MOInvariant);
 
-    // 4 is the preferred alignment for the CONSTANT memory space.
     InVals.push_back(Arg);
   }
 
   return Chain;
@@ -1829,6 +1813,52 @@
   return BuildVector;
 }
 
+SDValue R600TargetLowering::constBufferLoad(LoadSDNode *LoadNode, int Block,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(LoadNode);
+  EVT VT = LoadNode->getValueType(0);
+  SDValue Chain = LoadNode->getChain();
+  SDValue Ptr = LoadNode->getBasePtr();
+  assert (isa<ConstantSDNode>(Ptr));
+
+  //TODO: Support smaller loads
+  if (LoadNode->getMemoryVT().getScalarType() != MVT::i32 || !ISD::isNON_EXTLoad(LoadNode))
+    return SDValue();
+
+  if (LoadNode->getAlignment() < 4)
+    return SDValue();
+
+  int ConstantBlock = ConstantAddressBlock(Block);
+
+  SDValue Slots[4];
+  for (unsigned i = 0; i < 4; i++) {
+    // We want Const position encoded with the following formula :
+    // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
+    // const_index is Ptr computed by llvm using an alignment of 16.
+    // Thus we add (((512 + (kc_bank << 12)) + chan ) * 4 here and
+    // then div by 4 at the ISel step
+    SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+        DAG.getConstant(4 * i + ConstantBlock * 16, DL, MVT::i32));
+    Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
+  }
+  EVT NewVT = MVT::v4i32;
+  unsigned NumElements = 4;
+  if (VT.isVector()) {
+    NewVT = VT;
+    NumElements = VT.getVectorNumElements();
+  }
+  SDValue Result = DAG.getBuildVector(NewVT, DL, makeArrayRef(Slots, NumElements));
+  if (!VT.isVector()) {
+    Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
+                         DAG.getConstant(0, DL, MVT::i32));
+  }
+  SDValue MergedValues[2] = {
+    Result,
+    Chain
+  };
+  return DAG.getMergeValues(MergedValues, DL);
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Optimizations
 //===----------------------------------------------------------------------===//
@@ -2047,6 +2077,16 @@
     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG, DL);
     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, N->getVTList(), NewArgs);
   }
+
+  case ISD::LOAD: {
+    LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+    SDValue Ptr = LoadNode->getBasePtr();
+    if (LoadNode->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS &&
+        isa<ConstantSDNode>(Ptr))
+      return constBufferLoad(LoadNode, AMDGPUAS::CONSTANT_BUFFER_0, DAG);
+    break;
+  }
+
   default: break;
   }
 
Index: test/CodeGen/AMDGPU/kernel-args.ll
===================================================================
--- test/CodeGen/AMDGPU/kernel-args.ll
+++ test/CodeGen/AMDGPU/kernel-args.ll
@@ -16,13 +16,8 @@
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
 
-; EG: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: MOV * T1.X, KC0[2].Z,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-
-; CM: LSHR * T0.X, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: MOV * T1.X, KC0[2].Z,
+; EGCM: VTX_READ_8{{.*}} #3
+; EGCM: KC0[2].Y
 
 define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
   %ext = zext i8 %in to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
@@ -92,14 +87,8 @@
 
 ; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
 ; HSA-VI: flat_store_dword
-
-; EG: LSHR T0.X, KC0[2].Y, literal.x,
-; EG-NEXT: MOV * T1.X, KC0[2].Z,
-; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-
-; CM: LSHR * T0.X, KC0[2].Y, literal.x,
-; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
-; CM-NEXT: MOV * T1.X, KC0[2].Z,
+; EGCM: VTX_READ_16
+; EGCM: KC0[2].Y
 define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
   %ext = zext i16 %in to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
Index: test/CodeGen/AMDGPU/r600.extract-lowbits.ll
===================================================================
--- test/CodeGen/AMDGPU/r600.extract-lowbits.ll
+++ test/CodeGen/AMDGPU/r600.extract-lowbits.ll
@@ -16,8 +16,8 @@
 ; ---------------------------------------------------------------------------- ;
 
 ; R600-LABEL: bzhi32_a0:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
 ; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
 define amdgpu_kernel void @bzhi32_a0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %onebit = shl i32 1, %numlowbits
@@ -28,9 +28,9 @@
 }
 
 ; R600-LABEL: bzhi32_a1_indexzext:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
-; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
+; R600: BFE_UINT {{[ *]*}}[[RET]], KC0[2].Y, 0.0,
 define amdgpu_kernel void @bzhi32_a1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
   %conv = zext i8 %numlowbits to i32
   %onebit = shl i32 1, %conv
@@ -41,8 +41,8 @@
 }
 
 ; R600-LABEL: bzhi32_a4_commutative:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
 ; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
 define amdgpu_kernel void @bzhi32_a4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %onebit = shl i32 1, %numlowbits
@@ -57,8 +57,8 @@
 ; ---------------------------------------------------------------------------- ;
 
 ; R600-LABEL: bzhi32_b0:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
 ; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
 define amdgpu_kernel void @bzhi32_b0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %notmask = shl i32 -1, %numlowbits
@@ -69,9 +69,9 @@
 }
 
 ; R600-LABEL: bzhi32_b1_indexzext:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
-; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
+; R600: BFE_UINT {{[* ]*}}[[RET]], KC0[2].Y, 0.0
 define amdgpu_kernel void @bzhi32_b1_indexzext(i32 %val, i8 zeroext %numlowbits, i32 addrspace(1)* %out) {
   %conv = zext i8 %numlowbits to i32
   %notmask = shl i32 -1, %conv
@@ -82,8 +82,8 @@
 }
 
 ; R600-LABEL: bzhi32_b4_commutative:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
 ; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
 define amdgpu_kernel void @bzhi32_b4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %notmask = shl i32 -1, %numlowbits
@@ -98,8 +98,8 @@
 ; ---------------------------------------------------------------------------- ;
 
 ; R600-LABEL: bzhi32_c0:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
 ; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
 define amdgpu_kernel void @bzhi32_c0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i32 32, %numlowbits
@@ -110,9 +110,9 @@
 }
 
 ; R600-LABEL: bzhi32_c1_indexzext:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
-; R600: SUB_INT {{\*?}} [[SUBR:T[0-9]+]].[[SUBC:[XYZW]]], literal.x, KC0[2].Z
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
+; R600: SUB_INT {{\*?}} [[SUBR:T[0-9]+]].[[SUBC:[XYZW]]], literal
 ; R600-NEXT: 32
 ; R600-NEXT: AND_INT {{\*?}} {{T[0-9]+}}.[[AND1C:[XYZW]]], {{T[0-9]+|PV}}.[[SUBC]], literal.x
 ; R600-NEXT: 255
@@ -129,8 +129,8 @@
 }
 
 ; R600-LABEL: bzhi32_c4_commutative:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
 ; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
 define amdgpu_kernel void @bzhi32_c4_commutative(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i32 32, %numlowbits
@@ -145,8 +145,8 @@
 ; ---------------------------------------------------------------------------- ;
 
 ; R600-LABEL: bzhi32_d0:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
 ; R600: BFE_UINT {{\*?}} [[RET]], KC0[2].Y, 0.0, KC0[2].Z
 define amdgpu_kernel void @bzhi32_d0(i32 %val, i32 %numlowbits, i32 addrspace(1)* %out) {
   %numhighbits = sub i32 32, %numlowbits
@@ -157,9 +157,9 @@
 }
 
 ; R600-LABEL: bzhi32_d1_indexzext:
-; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-1]+\.[XYZW]]]
-; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-1]+\.[XYZW]]]
-; R600: SUB_INT {{\*?}} [[SUBR:T[0-9]+]].[[SUBC:[XYZW]]], literal.x, KC0[2].Z
+; EG: MEM_RAT_CACHELESS STORE_RAW [[RET:T[0-9]+\.[XYZW]]]
+; CM: MEM_RAT_CACHELESS STORE_DWORD [[RET:T[0-9]+\.[XYZW]]]
+; R600: SUB_INT {{\*?}} [[SUBR:T[0-9]+]].[[SUBC:[XYZW]]], literal
 ; R600-NEXT: 32
 ; R600-NEXT: AND_INT {{\*?}} [[AND:T[0-9]+\.[XYZW]]], {{T[0-9]+|PV}}.[[SUBC]], literal.x
 ; R600-NEXT: 255
Index: test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
===================================================================
--- test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
+++ test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
@@ -60,8 +60,11 @@
 }
 
 ; FUNC-LABEL: {{^}}test_implicit:
-; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56
-; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56
+; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 == KC0[3].Z
+; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], [[PTR:T[0-9]+.[XYZW]]]
+; EG-NOT: VTX_READ
+; EG-DAG: MOV {{\*?}} [[VAL]], KC0[3].Z
+; EG-DAG: LSHR {{\*? *}}[[PTR]], KC0[2].Y, literal
 define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
@@ -73,7 +76,7 @@
 
 ; FUNC-LABEL: {{^}}test_implicit_dyn:
 ; 36 prepended implicit bytes + 8(out pointer + in) = 44
-; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44
+; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44, #3
 define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
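
Editor's note (not part of the diff): the KC0 references in the updated FileCheck lines follow from the layout described in the comments above. Each KC0 entry is 16 bytes (four 32-bit channels), so a byte offset into the kernel-argument buffer maps to KC0[offset / 16] with channel X/Y/Z/W selected by (offset % 16) / 4, and the hardware index is formed with the (((512 + (kc_bank << 12) + const_index) << 2) + chan) formula quoted in constBufferLoad. Below is a minimal standalone sketch of that arithmetic; the program and its variable names are hypothetical and purely for illustration.

// Illustration only: maps a byte offset in the kernel-argument constant
// buffer to the KC0[index].chan form used in the tests above, and applies
// the encoding formula quoted in constBufferLoad.
#include <cstdio>

int main() {
  const char Chan[4] = {'X', 'Y', 'Z', 'W'};

  // From r600.work-item-intrinsics.ll: 36 prepended implicit bytes
  // + 4 (out pointer) + 4*4 = 56 bytes into the argument buffer.
  unsigned ByteOffset = 56;

  // One KC0 entry covers 16 bytes (constants are laid out with an
  // alignment of 16, i.e. four 32-bit channels per entry).
  unsigned ConstIndex = ByteOffset / 16;     // -> 3
  unsigned Channel = (ByteOffset % 16) / 4;  // -> 2, i.e. .Z

  // (((512 + (kc_bank << 12) + const_index) << 2) + chan), kc_bank = 0
  unsigned KCBank = 0;
  unsigned Encoded = ((512 + (KCBank << 12) + ConstIndex) << 2) + Channel;

  std::printf("offset %u -> KC0[%u].%c (encoded %u)\n",
              ByteOffset, ConstIndex, Chan[Channel], Encoded);
  // Prints: offset 56 -> KC0[3].Z (encoded 2062)
  return 0;
}

Running this reproduces the KC0[3].Z location that the test_implicit checks expect for the 56-byte offset.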