Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -328,12 +328,12 @@
     return HasMadMixInsts;
   }
 
-  bool hasSBufferLoadStoreAtomicDwordxN() const {
+  bool hasBuggySBufferLoadStoreAtomicxN() const {
     // Only use the "x1" variants on GFX9 or don't use the buffer variants.
     // For x2 and higher variants, if the accessed region spans 2 VM pages and
     // the second page is unmapped, the hw hangs.
     // TODO: There is one future GFX9 chip that doesn't have this bug.
-    return getGeneration() != GFX9;
+    return getGeneration() == GFX9;
   }
 
   bool hasCARRY() const {
Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -804,6 +804,8 @@
 // the same base register. We rely on the scheduler to do the hard work of
 // clustering nearby loads, and assume these are all adjacent.
 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
+  const SIMachineFunctionInfo *MFI =
+      MBB.getParent()->getInfo<SIMachineFunctionInfo>();
   bool Modified = false;
 
   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
@@ -849,7 +851,8 @@
       continue;
     }
 
-    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
+    if ((!STM->hasBuggySBufferLoadStoreAtomicxN() ||
+         MFI->shrinkBuggySBufferLoadStoreAtomicxN()) &&
         (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
          Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
       // EltSize is in units of the offset encoding.
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -181,6 +181,14 @@
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  // This converts s_buffer_xxx to s_xxx to allow xN loads on chips where
+  // the buffer opcodes are buggy, but at the cost of removing the bounds
+  // checking that the buffer opcodes provide.
+  //
+  // Constraint: Only the BASE_ADDRESS_HI field of WORD1 can be set, so that
+  // WORD0:WORD1 can trivially be used as an address.
+  bool ShrinkBuggySBufferLoadStoreAtomicxN : 1;
+
   // The hard-wired high half of the address of the global information table
   // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
   // current hardware only allows a 16 bit value.
@@ -392,6 +400,10 @@
     return ImplicitBufferPtr;
   }
 
+  bool shrinkBuggySBufferLoadStoreAtomicxN() const {
+    return ShrinkBuggySBufferLoadStoreAtomicxN;
+  }
+
   AMDGPUFunctionArgInfo &getArgInfo() {
     return ArgInfo;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,6 +47,7 @@
     WorkItemIDZ(false),
     ImplicitBufferPtr(false),
     ImplicitArgPtr(false),
+    ShrinkBuggySBufferLoadStoreAtomicxN(false),
     GITPtrHigh(0xffffffff),
     HighBitsOf32BitAddress(0) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -161,6 +162,9 @@
       FlatScratchInit = true;
   }
 
+  if (F.hasFnAttribute("amdgpu-shrink-buggy-sbuffer-opcodes"))
+    ShrinkBuggySBufferLoadStoreAtomicxN = true;
+
   Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
   StringRef S = A.getValueAsString();
   if (!S.empty())
Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -293,6 +293,7 @@
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   std::vector<unsigned> I1Defs;
 
@@ -305,6 +306,53 @@
       Next = std::next(I);
       MachineInstr &MI = *I;
 
+      // Shrink buggy scalar buffer loads.
+      if (ST.hasBuggySBufferLoadStoreAtomicxN() &&
+          MFI->shrinkBuggySBufferLoadStoreAtomicxN() &&
+          TII->isSMRD(MI.getOpcode())) {
+        unsigned NewOpcode = 0;
+
+        // No other s_buffer opcodes can be generated by LLVM at the moment.
+        switch (MI.getOpcode()) {
+        case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX2_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX16_IMM;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX2_SGPR;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX4_SGPR;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX8_SGPR;
+          break;
+        case AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR:
+          NewOpcode = AMDGPU::S_LOAD_DWORDX16_SGPR;
+          break;
+        default:
+          continue;
+        }
+
+        // Extract the 64-bit base (WORD0:WORD1 of the descriptor) and use it
+        // directly as the address operand of the plain s_load variant.
+        unsigned SAddr = TII->buildExtractSubReg(MI, MRI, MI.getOperand(1),
+                                                 &AMDGPU::SReg_128RegClass,
+                                                 AMDGPU::sub0_sub1,
+                                                 &AMDGPU::SReg_64_XEXECRegClass);
+        MI.setDesc(TII->get(NewOpcode));
+        MI.getOperand(1).setReg(SAddr);
+        continue;
+      }
+
       if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
         // If this has a literal constant source that is the same as the
         // reversed bits of an inline immediate, replace with a bitreverse of
Index: test/CodeGen/AMDGPU/smrd.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd.ll
+++ test/CodeGen/AMDGPU/smrd.ll
@@ -238,6 +238,27 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}smrd_imm_merged_shrunk:
+; GCN-NEXT: %bb.
+; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
+; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
+; VI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
+; VI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
+; GFX9-NEXT: s_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:1], 0x4
+; GFX9-NEXT: s_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:1], 0x1c
+define amdgpu_ps void @smrd_imm_merged_shrunk(<4 x i32> inreg %desc) #2 {
+main_body:
+  %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)
+  %r2 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 8)
+  %r3 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 12)
+  %r4 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 16)
+  %r5 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 28)
+  %r6 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 32)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
+  ret void
+}
+
 ; GCN-LABEL: {{^}}smrd_vgpr_merged:
 ; GCN-NEXT: %bb.
 ; GCN-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
@@ -266,3 +287,4 @@
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-shrink-buggy-sbuffer-opcodes" }
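
Note for reviewers: the shrink is opt-in per function via the
"amdgpu-shrink-buggy-sbuffer-opcodes" attribute, and is only legal when the
frontend guarantees the descriptor constraint documented in
SIMachineFunctionInfo.h (only BASE_ADDRESS_HI set in WORD1, so that
WORD0:WORD1 form a plain 64-bit address). As a rough sketch of how a frontend
might opt in, using the generic LLVM C++ API (the helper name is illustrative
and not part of this patch):

  #include "llvm/IR/Function.h"

  // Opt a shader function into the GFX9 workaround: SIShrinkInstructions
  // will then rewrite s_buffer_load_dwordxN (N >= 2) into s_load_dwordxN,
  // trading the buffer opcodes' bounds checking for wide loads that don't
  // hang. The caller must guarantee the WORD0:WORD1 address constraint.
  static void enableSBufferShrink(llvm::Function &F) {
    F.addFnAttr("amdgpu-shrink-buggy-sbuffer-opcodes");
  }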