Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -107,6 +107,8 @@
   bool isConstantLoad(const MemSDNode *N, int cbID) const;
   bool isUniformBr(const SDNode *N) const;
 
+  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+
   SDNode *glueCopyToM0(SDNode *N) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
@@ -363,6 +365,22 @@
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(
+    SDLoc &DL, uint64_t Imm, EVT VT) const {
+  SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                      CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
+                                                          MVT::i32));
+  SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                                      CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+  const SDValue Ops[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
+  };
+
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
+}
+
 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
   switch (NumVectorElts) {
   case 1:
@@ -549,19 +567,7 @@
     }
 
     SDLoc DL(N);
-    SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                        CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
-                                                            MVT::i32));
-    SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                        CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
-    const SDValue Ops[] = {
-        CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-        SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-        SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
-    };
-
-    ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
-                                          N->getValueType(0), Ops));
+    ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
     return;
   }
   case ISD::LOAD:
@@ -997,6 +1003,7 @@
   return true;
 }
 
+
 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
                                      SDValue &VAddr, SDValue &SOffset,
                                      SDValue &Offset, SDValue &Offen,
@@ -1030,8 +1037,22 @@
     SDValue N2 = N0.getOperand(0);
     SDValue N3 = N0.getOperand(1);
     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
-    Ptr = N2;
-    VAddr = N3;
+    if (N2->isDivergent()) {
+      if (N3->isDivergent()) {
+        // Both N2 and N3 are divergent. Keep the add and use N2+N3 as the
+        // vaddr, and construct the resource out of 0.
+        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+        VAddr = N0;
+      } else {
+        // N2 is divergent, N3 is not.
+        Ptr = N3;
+        VAddr = N2;
+      }
+    } else {
+      // N2 is not divergent.
+      Ptr = N2;
+      VAddr = N3;
+    }
   } else {
     // (add N0, C1) -> offset
     VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -1058,8 +1079,23 @@
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
-    Ptr = N0;
-    VAddr = N1;
+
+    if (N0->isDivergent()) {
+      if (N1->isDivergent()) {
+        // Both N0 and N1 are divergent. Use the result of the add as the
+        // addr64, and construct the resource from a 0 address.
+        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+        VAddr = Addr;
+      } else {
+        // N0 is divergent, N1 is not.
+        Ptr = N1;
+        VAddr = N0;
+      }
+    } else {
+      // N0 is not divergent.
+      Ptr = N0;
+      VAddr = N1;
+    }
     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
     return true;
   }
Index: test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -0,0 +1,90 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s
+
+; Check that an addrspace(1) (const) load with various combinations of
+; uniform, nonuniform and constant address components all load with an
+; addr64 mubuf with no readfirstlane.
+
+@indexable = internal unnamed_addr addrspace(1) constant [6 x <3 x float>] [<3 x float> <float 1.0, float 0.0, float 0.0>, <3 x float> <float 0.0, float 1.0, float 0.0>, <3 x float> <float 0.0, float 0.0, float 1.0>, <3 x float> <float 0.0, float 0.0, float 1.0>, <3 x float> <float 1.0, float 0.0, float 0.0>, <3 x float> <float 0.0, float 1.0, float 0.0>]
+
+; GCN-LABEL: {{^}}nonuniform_uniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
+.entry:
+  %tmp31 = sext i32 %arg18 to i64
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_uniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
+.entry:
+  %tmp31 = sext i32 %arg18 to i64
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31, i64 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform_const(i32 %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}