Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -155,8 +155,8 @@ bool SelectFlatOffset(SDValue Addr, SDValue &VAddr, SDValue &Offset, SDValue &SLC) const; - bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, - bool &Imm) const; + bool SelectSMRDOffset(SDValue ByteOffsetNode, bool Addr32Bit, + SDValue &Offset, bool &Imm) const; SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const; @@ -1346,7 +1346,8 @@ } bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, - SDValue &Offset, bool &Imm) const { + bool Addr32Bit, SDValue &Offset, + bool &Imm) const { // FIXME: Handle non-constant offsets. ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode); @@ -1358,6 +1359,14 @@ int64_t ByteOffset = C->getSExtValue(); int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset); + // A 32-bit (address + offset) can wrap around, while the same expression + // might not wrap around in 64 bits. We need to fix this case for tests that + // do (Addr - BigConst + BigConst), so assume that a "sufficiently small" + // offset never causes a wraparound. The chosen number is a guess. Address + // wraparounds are never expected to occur in normal apps. 
+ if (Addr32Bit && (uint64_t)ByteOffset > 256 * 1024) + return false; + if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) { Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32); Imm = true; @@ -1412,7 +1421,7 @@ SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); - if (SelectSMRDOffset(N1, Offset, Imm)) { + if (SelectSMRDOffset(N1, Addr.getValueType() == MVT::i32, Offset, Imm)) { SBase = Expand32BitAddress(N0); return true; } @@ -1452,7 +1461,7 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const { bool Imm; - return SelectSMRDOffset(Addr, Offset, Imm) && Imm; + return SelectSMRDOffset(Addr, false, Offset, Imm) && Imm; } bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr, @@ -1461,7 +1470,7 @@ return false; bool Imm; - if (!SelectSMRDOffset(Addr, Offset, Imm)) + if (!SelectSMRDOffset(Addr, false, Offset, Imm)) return false; return !Imm && isa<ConstantSDNode>(Offset); Index: test/CodeGen/AMDGPU/constant-address-space-32bit.ll =================================================================== --- test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI,CI %s ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s @@ -268,6 +268,30 @@ ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43 } +; GCN-LABEL: {{^}}load_addr_fold: +; GCN-DAG: s_mov_b32 s1, 0 +; SI-DAG: s_mov_b32 s2, 0x40000 +; SI: s_load_dword s{{[0-9]}}, s[0:1], s2 +; CI: s_load_dword s{{[0-9]}}, s[0:1], 0x10000 +; VIGFX9: s_load_dword 
s{{[0-9]}}, s[0:1], 0x40000 +define amdgpu_vs float @load_addr_fold(i32 addrspace(6)* inreg %p0) #0 { + %gep1 = getelementptr i32, i32 addrspace(6)* %p0, i64 65536 + %r1 = load i32, i32 addrspace(6)* %gep1 + %r2 = bitcast i32 %r1 to float + ret float %r2 +} + +; GCN-LABEL: {{^}}load_addr_no_fold: +; GCN-DAG: s_add_i32 s0, s0, 0x40004 +; GCN-DAG: s_mov_b32 s1, 0 +; GCN: s_load_dword s{{[0-9]}}, s[0:1], 0x0 +define amdgpu_vs float @load_addr_no_fold(i32 addrspace(6)* inreg %p0) #0 { + %gep1 = getelementptr i32, i32 addrspace(6)* %p0, i64 65537 + %r1 = load i32, i32 addrspace(6)* %gep1 + %r2 = bitcast i32 %r1 to float + ret float %r2 +} + ; Function Attrs: nounwind readnone speculatable declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6