Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -106,6 +106,8 @@
   bool isUniformBr(const SDNode *N) const;
 
+  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
+
   SDNode *glueCopyToM0(SDNode *N) const;
 
   const TargetRegisterClass *getOperandRegClass(SDNode *N,
                                                 unsigned OpNo) const;
@@ -372,6 +374,22 @@
   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
 }
 
+MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
+                                                  EVT VT) const {
+  SDNode *Lo = CurDAG->getMachineNode(
+      AMDGPU::S_MOV_B32, DL, MVT::i32,
+      CurDAG->getConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
+  SDNode *Hi =
+      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
+                             CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
+  const SDValue Ops[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
+
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
+}
+
 static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
   switch (NumVectorElts) {
   case 1:
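The helper added above splits a 64-bit immediate into two S_MOV_B32 halves and reassembles them with a REG_SEQUENCE. A minimal usage sketch, assuming a selection context like the surrounding code (the variable names and the constant are illustrative, not from the patch):

    // Materialize the 64-bit constant (1 << 32) as a v2i32 SReg_64 value:
    // low half = S_MOV_B32 0, high half = S_MOV_B32 1, glued by REG_SEQUENCE.
    SDLoc DL(N);
    MachineSDNode *Imm64 = buildSMovImm64(DL, UINT64_C(1) << 32, MVT::v2i32);
    SDValue Base(Imm64, 0); // result 0 of the REG_SEQUENCE machine node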
@@ -557,19 +575,7 @@
   }
 
   SDLoc DL(N);
-  SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                                CurDAG->getConstant(Imm & 0xFFFFFFFF, DL,
-                                                    MVT::i32));
-  SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                        CurDAG->getConstant(Imm >> 32, DL, MVT::i32));
-  const SDValue Ops[] = {
-    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
-    SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
-    SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
-  };
-
-  ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
-                                        N->getValueType(0), Ops));
+  ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
   return;
 }
 case ISD::LOAD:
@@ -1014,55 +1020,72 @@
   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
   SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
 
+  ConstantSDNode *C1 = nullptr;
+  SDValue N0 = Addr;
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
-    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
+    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isUInt<32>(C1->getZExtValue()))
+      N0 = Addr.getOperand(0);
+    else
+      C1 = nullptr;
+  }
+
+  if (N0.getOpcode() == ISD::ADD) {
+    // (add N2, N3) -> addr64, or
+    // (add (add N2, N3), C1) -> addr64
+    SDValue N2 = N0.getOperand(0);
+    SDValue N3 = N0.getOperand(1);
+    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
 
-    if (N0.getOpcode() == ISD::ADD) {
-      // (add (add N2, N3), C1) -> addr64
-      SDValue N2 = N0.getOperand(0);
-      SDValue N3 = N0.getOperand(1);
-      Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+    if (N2->isDivergent()) {
+      if (N3->isDivergent()) {
+        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+        // addr64, and construct the resource from a 0 address.
+        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+        VAddr = N0;
+      } else {
+        // N2 is divergent, N3 is not.
+        Ptr = N3;
+        VAddr = N2;
+      }
+    } else {
+      // N2 is not divergent.
       Ptr = N2;
       VAddr = N3;
-    } else {
-      // (add N0, C1) -> offset
-      VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
-      Ptr = N0;
-    }
-
-    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
-      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
-      return true;
     }
-
-    if (isUInt<32>(C1->getZExtValue())) {
-      // Illegal offset, store it in soffset.
-      Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
-      SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
-                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
-                        0);
-      return true;
-    }
-  }
-
-  if (Addr.getOpcode() == ISD::ADD) {
-    // (add N0, N1) -> addr64
-    SDValue N0 = Addr.getOperand(0);
-    SDValue N1 = Addr.getOperand(1);
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  } else if (N0->isDivergent()) {
+    // N0 is divergent. Use it as the addr64, and construct the resource from a
+    // 0 address.
+    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
+    VAddr = N0;
     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
+  } else {
+    // N0 -> offset, or
+    // (N0 + C1) -> offset
+    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
     Ptr = N0;
-    VAddr = N1;
+  }
+
+  if (!C1) {
+    // No offset.
     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
     return true;
   }
 
-  // default case -> offset
-  VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
-  Ptr = Addr;
-  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
+    // Legal offset for instruction.
+    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+    return true;
+  }
+
+  // Illegal offset, store it in soffset.
+  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+  SOffset =
+      SDValue(CurDAG->getMachineNode(
+                  AMDGPU::S_MOV_B32, DL, MVT::i32,
+                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
+              0);
   return true;
 }
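The new case in SelectMUBUF is an address with no uniform component at all: both add operands, or the whole pointer, may be divergent, in which case the pointer goes into vaddr and the resource is built from a zero base via buildSMovImm64. A minimal IR sketch of that situation, modeled on the shader-addr64-nonuniform.ll test added below (the function itself is hypothetical, not part of the patch):

    define amdgpu_ps float @divergent_ptr(i32 %vgpr) {
      ; %ptr is divergent: it is derived from a per-lane VGPR argument, so
      ; selection uses addr64 with the address in VGPRs and a zero rsrc base.
      %ext = zext i32 %vgpr to i64
      %ptr = inttoptr i64 %ext to float addrspace(1)*
      %val = load float, float addrspace(1)* %ptr
      ret float %val
    }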
Index: llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/legalize-fp-load-invariant.ll
@@ -3,7 +3,7 @@
 
 ; Type legalization for illegal FP type results was dropping invariant
 ; and dereferenceable flags.
-; GCN: BUFFER_LOAD_USHORT_OFFSET killed %{{[0-9]+}}, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 2 from %ir.ptr, addrspace 4)
+; GCN: BUFFER_LOAD_USHORT{{.*}} :: (dereferenceable invariant load 2 from %ir.ptr, addrspace 4)
 define half @legalize_f16_load(half addrspace(4)* dereferenceable(4) %ptr) {
   %load = load half, half addrspace(4)* %ptr, !invariant.load !0
   %add = fadd half %load, 1.0
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -59,10 +59,9 @@
 
 ; GCN-LABEL: {{^}}func_implicitarg_ptr:
 ; GCN: s_waitcnt
-; MESA: s_mov_b64 s[8:9], s[6:7]
-; MESA: s_mov_b32 s11, 0xf000
-; MESA: s_mov_b32 s10, -1
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; HSA: v_mov_b32_e32 v0, s6
 ; HSA: v_mov_b32_e32 v1, s7
 ; HSA: flat_load_dword v0, v[0:1]
@@ -77,10 +76,9 @@
 
 ; GCN-LABEL: {{^}}opencl_func_implicitarg_ptr:
 ; GCN: s_waitcnt
-; MESA: s_mov_b64 s[8:9], s[6:7]
-; MESA: s_mov_b32 s11, 0xf000
-; MESA: s_mov_b32 s10, -1
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; HSA: v_mov_b32_e32 v0, s6
 ; HSA: v_mov_b32_e32 v1, s7
 ; HSA: flat_load_dword v0, v[0:1]
@@ -164,16 +162,15 @@
 
 ; GCN-LABEL: {{^}}func_kernarg_implicitarg_ptr:
 ; GCN: s_waitcnt
-; MESA: s_mov_b64 s[12:13], s[6:7]
-; MESA: s_mov_b32 s15, 0xf000
-; MESA: s_mov_b32 s14, -1
-; MESA: buffer_load_dword v0, off, s[12:15], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: v_mov_b32_e32 v2, s8
+; MESA: v_mov_b32_e32 v3, s9
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; HSA: v_mov_b32_e32 v0, s6
 ; HSA: v_mov_b32_e32 v1, s7
 ; HSA: flat_load_dword v0, v[0:1]
-; MESA: s_mov_b32 s10, s14
-; MESA: s_mov_b32 s11, s15
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
 ; HSA: v_mov_b32_e32 v0, s8
 ; HSA: v_mov_b32_e32 v1, s9
 ; HSA: flat_load_dword v0, v[0:1]
@@ -191,16 +188,15 @@
 
 ; GCN-LABEL: {{^}}opencl_func_kernarg_implicitarg_ptr:
 ; GCN: s_waitcnt
-; MESA: s_mov_b64 s[12:13], s[6:7]
-; MESA: s_mov_b32 s15, 0xf000
-; MESA: s_mov_b32 s14, -1
-; MESA: buffer_load_dword v0, off, s[12:15], 0
+; MESA: v_mov_b32_e32 v0, s6
+; MESA: v_mov_b32_e32 v1, s7
+; MESA: v_mov_b32_e32 v2, s8
+; MESA: v_mov_b32_e32 v3, s9
+; MESA: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
 ; HSA: v_mov_b32_e32 v0, s6
 ; HSA: v_mov_b32_e32 v1, s7
 ; HSA: flat_load_dword v0, v[0:1]
-; MESA: s_mov_b32 s10, s14
-; MESA: s_mov_b32 s11, s15
-; MESA: buffer_load_dword v0, off, s[8:11], 0
+; MESA: buffer_load_dword v0, v[2:3], s[8:11], 0 addr64
 ; HSA: v_mov_b32_e32 v0, s8
 ; HSA: v_mov_b32_e32 v1, s9
 ; HSA: flat_load_dword v0, v[0:1]
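As a reading aid for the updated MESA checks above: with addr64 set, the hardware adds the 64-bit address in the vaddr register pair to the base held in the buffer resource, so the implicit-argument pointer can stay in VGPRs instead of being copied into the resource with s_mov_b64. Annotated operand layout (annotations mine, per the SI/CI MUBUF encoding):

    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
    ;                 |   |       |        soffset (0 here)
    ;                 |   |       128-bit buffer resource descriptor
    ;                 |   64-bit address in a VGPR pair, added to the rsrc base
    ;                 destination VGPR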
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.log.f16.ll
@@ -9,7 +9,6 @@
 ; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]
 ; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]
 ; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]
-; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
 ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3f317218, v[[R_F32_0]]
Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.log10.f16.ll
@@ -9,7 +9,6 @@
 ; SI: buffer_load_ushort v[[A_F16_0:[0-9]+]]
 ; VI: flat_load_ushort v[[A_F16_0:[0-9]+]]
 ; GFX9: global_load_ushort v[[A_F16_0:[0-9]+]]
-; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
 ; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x3e9a209a, v[[R_F32_0]]
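The new test file that follows exercises the divergence combinations end to end. For reference, its SICI checks match loads of the shape sketched below (operands illustrative, not actual test output), with no v_readfirstlane_b32 allowed anywhere on the path:

    buffer_load_dwordx4 v[0:3], v[4:5], s[0:3], 0 addr64
    ; nonuniform (per-lane) 64-bit index in v[4:5]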
Index: llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/shader-addr64-nonuniform.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx700 -verify-machineinstrs <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SICI %s
+
+; Check that an addrspace(1) (const) load with various combinations of
+; uniform, nonuniform and constant address components all load with an
+; addr64 mubuf with no readfirstlane.
+
+@indexable = internal unnamed_addr addrspace(1) constant [6 x <3 x float>] [<3 x float> , <3 x float> , <3 x float> , <3 x float> , <3 x float> , <3 x float> ]
+
+; GCN-LABEL: {{^}}nonuniform_uniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform(i32 %arg18) {
+.entry:
+  %tmp31 = sext i32 %arg18 to i64
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform(i32 inreg %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}const_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @const_nonuniform(i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 1
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dwordx4 {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform(i32 %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset
+  %tmp33 = load <3 x float>, <3 x float> addrspace(1)* %tmp32, align 16
+  %tmp34 = extractelement <3 x float> %tmp33, i32 0
+  ret float %tmp34
+}
+
+; GCN-LABEL: {{^}}nonuniform_uniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_uniform_const(i32 %arg18) {
+.entry:
+  %tmp31 = sext i32 %arg18 to i64
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* @indexable, i64 0, i64 %tmp31, i64 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}uniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @uniform_nonuniform_const(i32 inreg %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
+
+; GCN-LABEL: {{^}}nonuniform_nonuniform_const:
+; GCN-NOT: readfirstlane
+; SICI: buffer_load_dword {{.*}} addr64
+
+define amdgpu_ps float @nonuniform_nonuniform_const(i32 %offset, i32 %arg18) {
+.entry:
+  %tmp1 = zext i32 %arg18 to i64
+  %tmp2 = inttoptr i64 %tmp1 to [6 x <3 x float>] addrspace(1)*
+  %tmp32 = getelementptr [6 x <3 x float>], [6 x <3 x float>] addrspace(1)* %tmp2, i32 0, i32 %offset, i32 1
+  %tmp33 = load float, float addrspace(1)* %tmp32, align 4
+  ret float %tmp33
+}
+
+
+
+
Index: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
@@ -201,8 +201,7 @@
 
 ; Initialize inner condition to false
 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader
-; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
-; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
+; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
 ; Clear exec bits for workitems that load -1s
 ; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]: