diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -188,6 +188,8 @@
                          SDValue &VOffset, SDValue &Offset) const;
   bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                           SDValue &Offset) const;
+  bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr,
+                                     uint64_t ImmOffset) const;
   bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
                            SDValue &SAddr, SDValue &Offset) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1795,6 +1795,31 @@
   return true;
 }
 
+// Check whether the flat scratch SVS swizzle bug affects this access.
+//
+// Returns true when the known bits of \p VAddr and of \p SAddr + \p ImmOffset
+// cannot rule out the problematic carry, in which case callers bail out.
+// Always returns false on subtargets without the bug.
+bool AMDGPUDAGToDAGISel::checkFlatScratchSVSSwizzleBug(
+    SDValue VAddr, SDValue SAddr, uint64_t ImmOffset) const {
+  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+    return false;
+
+  // The bug affects the swizzling of SVS accesses if there is any carry out
+  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+  // voffset to (soffset + inst_offset).
+  KnownBits VKnown = CurDAG->computeKnownBits(VAddr);
+  // ImmOffset may encode a negative offset, so build the 32-bit constant as a
+  // signed value instead of relying on implicit truncation of the uint64_t.
+  KnownBits SKnown = KnownBits::computeForAddSub(
+      true, false, CurDAG->computeKnownBits(SAddr),
+      KnownBits::makeConstant(APInt(32, ImmOffset, /*isSigned=*/true)));
+  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+  // Unknown bits are assumed set, so this is the worst case for the low bits.
+  return (VMax & 3) + (SMax & 3) >= 4;
+}
+
 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
                                              SDValue &VAddr, SDValue &SAddr,
                                              SDValue &Offset) const {
@@ -1822,6 +1847,8 @@
           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
         VAddr = SDValue(VMov, 0);
         SAddr = LHS;
+        if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
+          return false;
         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
         return true;
       }
@@ -1844,6 +1871,8 @@
     return false;
   }
 
+  if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
+    return false;
   SAddr = SelectSAddrFI(CurDAG, SAddr);
   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
   return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -210,6 +210,8 @@
 
   InstructionSelector::ComplexRendererFns
   selectScratchSAddr(MachineOperand &Root) const;
+  bool checkFlatScratchSVSSwizzleBug(Register VAddr, Register SAddr,
+                                     uint64_t ImmOffset) const;
   InstructionSelector::ComplexRendererFns
   selectScratchSVAddr(MachineOperand &Root) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3980,6 +3980,24 @@
   }};
 }
 
+// Check whether the flat scratch SVS swizzle bug affects this access.
+bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
+    Register VAddr, Register SAddr, uint64_t ImmOffset) const {
+  if (!Subtarget->hasFlatScratchSVSSwizzleBug())
+    return false;
+
+  // The bug affects the swizzling of SVS accesses if there is any carry out
+  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
+  // voffset to (soffset + inst_offset). inst_offset is treated as signed.
+  auto VKnown = KnownBits->getKnownBits(VAddr);
+  auto SKnown = KnownBits::computeForAddSub(
+      true, false, KnownBits->getKnownBits(SAddr),
+      KnownBits::makeConstant(APInt(32, ImmOffset, /*isSigned=*/true)));
+  uint64_t VMax = VKnown.getMaxValue().getZExtValue();
+  uint64_t SMax = SKnown.getMaxValue().getZExtValue();
+  return (VMax & 3) + (SMax & 3) >= 4;
+}
+
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
   Register Addr = Root.getReg();
@@ -4008,6 +4026,9 @@
   Register LHS = AddrDef->MI->getOperand(1).getReg();
   auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
 
+  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
+    return None;
+
   if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
     int FI = LHSDef->MI->getOperand(1).getIndex();
     return {{
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1044,6 +1044,8 @@
 
   bool hasVOPDInsts() const { return HasVOPDInsts; }
 
+  bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
+
   /// Return true if the target has the S_DELAY_ALU instruction.
bool hasDelayAlu() const { return GFX11Insts; } diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -51,12 +51,12 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -132,17 +132,17 @@ ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: 
s_endpgm ; @@ -311,12 +311,12 @@ ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -395,18 +395,18 @@ ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, off offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -576,16 +576,17 @@ ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; 
GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 2 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: v_add3_u32 v2, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v2, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v2, v3, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; @@ -664,18 +665,19 @@ ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 +; GFX11-SDAG-NEXT: v_add3_u32 v3, 4, s0, v0 ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v2, s0 offset:2 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v3, v2, off offset:2 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-SDAG-NEXT: scratch_store_b8 v0, v3, s0 offset:4 dlc +; GFX11-SDAG-NEXT: scratch_store_b8 v0, v4, s0 offset:4 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-SDAG-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4091,11 +4091,11 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 ; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: s_movk_i32 s0, 0xef7f -; GFX11-NEXT: scratch_store_b8 v0, v1, s0 dlc +; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: scratch_load_u8 v0, v0, s0 glc dlc +; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -4149,11 +4149,11 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: s_movk_i32 s0, 0xef7f -; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, s0 dlc +; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, s0 glc dlc +; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31] bb: