Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -131,7 +131,7 @@ SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - void SelectMUBUFConstant(SDValue Constant, + bool SelectMUBUFConstant(SDValue Constant, SDValue &SOffset, SDValue &ImmOffset) const; bool SelectMUBUFIntrinsicOffset(SDValue Offset, SDValue &SOffset, @@ -1168,7 +1168,7 @@ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE); } -void AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, +bool AMDGPUDAGToDAGISel::SelectMUBUFConstant(SDValue Constant, SDValue &SOffset, SDValue &ImmOffset) const { SDLoc DL(Constant); @@ -1193,6 +1193,13 @@ } } + // There is a hardware bug in SI and CI which prevents address clamping in + // MUBUF instructions from working correctly with SOffsets. The immediate + // offset is unaffected. + if (Overflow > 0 && + Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS) + return false; + ImmOffset = CurDAG->getTargetConstant(Imm, DL, MVT::i16); if (Overflow <= 64) @@ -1201,6 +1208,8 @@ SOffset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, CurDAG->getTargetConstant(Overflow, DL, MVT::i32)), 0); + + return true; } bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicOffset(SDValue Offset, @@ -1211,9 +1220,7 @@ if (!isa(Offset)) return false; - SelectMUBUFConstant(Offset, SOffset, ImmOffset); - - return true; + return SelectMUBUFConstant(Offset, SOffset, ImmOffset); } bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, @@ -1223,20 +1230,30 @@ SDLoc DL(Offset); // Don't generate an unnecessary voffset for constant offsets. - if (isa(Offset)) - return false; + if (isa(Offset)) { + SDValue Tmp1, Tmp2; + + // When necessary, use a voffset in <= CI anyway to work around a hardware + // bug. + if (Subtarget->getGeneration() > AMDGPUSubtarget::SEA_ISLANDS || + SelectMUBUFConstant(Offset, Tmp1, Tmp2)) + return false; + } if (CurDAG->isBaseWithConstantOffset(Offset)) { SDValue N0 = Offset.getOperand(0); SDValue N1 = Offset.getOperand(1); - SelectMUBUFConstant(N1, SOffset, ImmOffset); - VOffset = N0; - } else { - SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); - ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); - VOffset = Offset; + if (cast(N1)->getSExtValue() >= 0 && + SelectMUBUFConstant(N1, SOffset, ImmOffset)) { + VOffset = N0; + return true; + } } + SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32); + ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16); + VOffset = Offset; + return true; } Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -1,9 +1,9 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}test1: ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0 glc -;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff +;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) @@ -13,7 +13,8 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc +;SICI: buffer_atomic_swap v0, v1, s[0:3], 0 offen glc +;VI: buffer_atomic_swap v0, off, s[0:3], [[SOFS]] offset:1 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, off, s[0:3], 0{{$}} define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { @@ -70,7 +71,7 @@ ;CHECK-LABEL: {{^}}test3: ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 glc ;CHECK: s_waitcnt vmcnt(0) -;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff +;VI: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen glc @@ -79,7 +80,8 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) -;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc +;SICI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen glc +;VI: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[SOFS]] offset:1 glc define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -1,5 +1,5 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}buffer_load: ;CHECK: buffer_load_format_xyzw v[0:3], off, s[0:3], 0 @@ -27,11 +27,15 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;CHECK-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095 -;CHECK-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093 -;CHECK: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1 +;SICI: v_mov_b32_e32 [[VOFS:v[0-9]+]], 0x103c +;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, [[VOFS]], s[0:3], 0 offen +;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen +;VI-DAG: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 61 offset:4095 +;VI-DAG: s_movk_i32 [[OFS1:s[0-9]+]], 0x7fff +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS1]] offset:4093 +;SICI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 offen +;VI: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS2]] offset:1 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: @@ -44,11 +48,11 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs_reuse: -;CHECK: s_movk_i32 [[OFS:s[0-9]+]], 0xfff -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65 -;CHECK-NOT: s_mov -;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81 -;CHECK: s_waitcnt +;VI: s_movk_i32 [[OFS:s[0-9]+]], 0xfff +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:65 +;VI-NOT: s_mov +;VI: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], [[OFS]] offset:81 +;VI: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) Index: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -1,5 +1,5 @@ -;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=SICI +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=VI ;CHECK-LABEL: {{^}}buffer_load: ;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 @@ -27,8 +27,9 @@ } ;CHECK-LABEL: {{^}}buffer_load_immoffs_large: -;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff -;CHECK: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1 +;SICI: buffer_load_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 offen +;VI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff +;VI: buffer_load_dwordx4 v[0:3], off, s[0:3], [[OFFSET]] offset:1 ;CHECK: s_waitcnt define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: @@ -101,6 +102,16 @@ ret <2 x float> %data } +;CHECK-LABEL: {{^}}buffer_load_negative_offset: +;CHECK: v_add_i32_e32 [[VOFS:v[0-9]+]], vcc, -16, v0 +;CHECK: buffer_load_dwordx4 v[0:3], [[VOFS]], s[0:3], 0 offen +define amdgpu_ps <4 x float> @buffer_load_negative_offset(<4 x i32> inreg, i32 %ofs) { +main_body: + %ofs.1 = add i32 %ofs, -16 + %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs.1, i1 0, i1 0) + ret <4 x float> %data +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0