Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -106,6 +106,8 @@
   bool isUniformBr(const SDNode *N) const;
 
+  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+
   SDNode *glueCopyToM0(SDNode *N) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
 
@@ -372,6 +374,22 @@
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
+                                                  EVT VT) const {
+  SDNode *Lo = CurDAG->getMachineNode(
+      AMDGPU::S_MOV_B32, DL, MVT::i32,
+      CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
+  SDNode *Hi =
+      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                             CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+  const SDValue Ops[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
+
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
+}
+
 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
   switch (NumVectorElts) {
   case 1:
@@ -557,19 +575,7 @@
   }
 
   SDLoc DL(N);
-  SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                      CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
-                                                          MVT::i32));
-  SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                      CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
-  const SDValue Ops[] = {
-    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-    SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-    SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
-  };
-
-  ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
-                                        N->getValueType(0), Ops));
+  ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
   return;
 }
 case ISD::LOAD:
@@ -1000,55 +1006,66 @@
   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
   SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
 
+  ConstantSDNode *C1 = nullptr;
+  SDValue N0 = Addr;
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isUInt<32>(C1->getZExtValue()))
+      N0 = Addr.getOperand(0);
+    else
+      C1 = nullptr;
+  }
+
+  if (N0.getOpcode() == ISD::ADD) {
+    // (add N2, N3) -> addr64, or
+    // (add (add N2, N3), C1) -> addr64
+    SDValue N2 = N0.getOperand(0);
+    SDValue N3 = N0.getOperand(1);
+    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
 
-    if (N0.getOpcode() == ISD::ADD) {
-      // (add (add N2, N3), C1) -> addr64
-      SDValue N2 = N0.getOperand(0);
-      SDValue N3 = N0.getOperand(1);
-      Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+    if (N2->isDivergent()) {
+      if (N3->isDivergent()) {
+        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+        // addr64, and construct the resource from a 0 address.
+        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+        VAddr = N0;
+      } else {
+        // N2 is divergent, N3 is not.
+        Ptr = N3;
+        VAddr = N2;
+      }
+    } else {
+      // N2 is not divergent.
       Ptr = N2;
       VAddr = N3;
-    } else {
-      // (add N0, C1) -> offset
-      VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      Ptr = N0;
-    }
-
-    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
-      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-      return true;
-    }
-
-    if (isUInt<32>(C1->getZExtValue())) {
-      // Illegal offset, store it in soffset.
-      Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-      SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                          CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
-                        0);
-      return true;
     }
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  } else {
+    // N0 -> offset, or
+    // (N0 + C1) -> offset
+    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
+    Ptr = N0;
   }
 
-  if (Addr.getOpcode() == ISD::ADD) {
-    // (add N0, N1) -> addr64
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
-    Ptr = N0;
-    VAddr = N1;
+  if (!C1) {
+    // No offset.
     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
     return true;
   }
 
-  // default case -> offset
-  VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
-  Ptr = Addr;
-  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
+    // Legal offset for instruction.
+    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+    return true;
+  }
+
+  // Illegal offset, store it in soffset.
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  SOffset =
+      SDValue(CurDAG->getMachineNode(
+                  AMDGPU::S_MOV_B32, DL, MVT::i32,
+                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
+              0);
   return true;
 }
Index: test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -0,0 +1,90 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s
+
+; Check that an addrspace(1) (const) load with various combinations of
+; uniform, nonuniform and constant address components all load with an
+; addr64 mubuf with no readfirstlane.
+
+@indexable = internal unnamed_addr addrspace(1) constant [6 x <3 x float>] [<3 x float> , <3 x float> , <3 x float> , <3 x float> , <3 x float> , <3 x float> ]
+
+; GCN-LABEL: {{^}}nonuniform_uniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
+.entry:
+  %tmp31 = sext i32 %arg18 to i64
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_uniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
+.entry:
+  %tmp31 = sext i32 %arg18 to i64
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31, i64 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform_const(i32 %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
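
Reviewer note, not part of the patch: the new SelectMUBUF path above first splits an (add N2, N3) base by divergence to decide what feeds the 64-bit resource pointer versus the addr64 VGPR address, and then classifies the constant offset as an immediate or an soffset. Below is a minimal standalone C++ sketch of that decision table, using plain booleans and hypothetical names (AddrSplit, splitAddBase, fitsImmOffset) in place of SelectionDAG nodes, and assuming the usual 12-bit MUBUF immediate offset field; it is illustrative only, not the LLVM implementation.

#include <cstdint>
#include <iostream>

// Which operand feeds the SGPR resource base and which feeds the addr64 VGPR
// address, mirroring the divergence checks added in SelectMUBUF above.
struct AddrSplit {
  const char *RsrcBase; // goes into the 64-bit resource descriptor base
  const char *VAddr;    // goes into the addr64 VGPR address
};

AddrSplit splitAddBase(bool LhsDivergent, bool RhsDivergent) {
  if (LhsDivergent && RhsDivergent)
    // Both halves divergent: the resource base is a constant 0 (built with an
    // S_MOV_B32 pair plus REG_SEQUENCE in the patch) and the whole add is VAddr.
    return {"zero", "lhs+rhs"};
  if (LhsDivergent)
    return {"rhs", "lhs"}; // keep the uniform half in the resource
  return {"lhs", "rhs"};   // lhs is uniform (rhs may or may not be)
}

// Offset classification: a small offset fits the immediate field (assumed
// 12 bits here); anything larger, but still 32-bit, is moved into soffset.
bool fitsImmOffset(uint64_t Off) { return Off < (1u << 12); }

int main() {
  AddrSplit S = splitAddBase(/*LhsDivergent=*/true, /*RhsDivergent=*/false);
  std::cout << "rsrc base: " << S.RsrcBase << ", vaddr: " << S.VAddr << '\n';
  std::cout << "4095 fits imm offset: " << fitsImmOffset(4095) << '\n';
  std::cout << "4096 fits imm offset: " << fitsImmOffset(4096) << '\n';
  return 0;
}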