Index: llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -323,7 +323,10 @@
     // The isPseudo check really shouldn't be here, but unfortunately there are
     // some negative lit tests that depend on being able to continue through
     // here even when pseudo instructions haven't been lowered.
-    if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU())) {
+    //
+    // We also overestimate branch sizes with the offset bug.
+    if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
+        (!STI.hasOffset3fBug() || !MI->isBranch())) {
       SmallVector<MCFixup, 4> Fixups;
       SmallVector<char, 16> CodeBytes;
       raw_svector_ostream CodeStream(CodeBytes);
Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2284,7 +2284,7 @@
     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
       .addMBB(TBB);
     if (BytesAdded)
-      *BytesAdded = 4;
+      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
     return 1;
   }
@@ -2311,7 +2311,7 @@
     fixImplicitOperands(*CondBr);
 
     if (BytesAdded)
-      *BytesAdded = 4;
+      *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
     return 1;
   }
@@ -2328,7 +2328,7 @@
     CondReg.setIsKill(Cond[1].isKill());
 
     if (BytesAdded)
-      *BytesAdded = 8;
+      *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
     return 2;
   }
@@ -6627,8 +6627,16 @@
   // If we have a definitive size, we can use it. Otherwise we need to inspect
   // the operands to know the size.
-  if (isFixedSize(MI))
-    return DescSize;
+  if (isFixedSize(MI)) {
+    unsigned Size = DescSize;
+
+    // If we hit the buggy offset, an extra nop will be inserted in MC so
+    // estimate the worst case.
+    if (MI.isBranch() && ST.hasOffset3fBug())
+      Size += 4;
+
+    return Size;
+  }
 
   // 4-byte instructions may have a 32-bit literal encoded after them. Check
   // operands that could ever be literals.
Index: llvm/lib/Target/AMDGPU/SOPInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1058,6 +1058,7 @@
   let hasSideEffects = 0;
   let SALU = 1;
   let SOPP = 1;
+  let FixedSize = 1;
   let SchedRW = [WriteSALU];
   let UseNamedOperandTable = 1;
   bits <16> simm16;
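The hunks above share one idea: on subtargets with the gfx10 offset-0x3f
hardware bug, the MC layer may have to pad a branch with an extra 4-byte nop,
so every branch size reported to branch relaxation is bumped by 4 bytes (8
extra for the two-instruction conditional sequence), and the MCInstLower size
check is skipped for branches since the estimate may now exceed the emitted
size. The new FixedSize bit on SOPP instructions makes getInstSizeInBytes()
take the fixed-size path for these branches so the adjustment applies there.
As a minimal standalone sketch of the estimation pattern (Subtarget and
worstCaseBranchSize here are illustrative stand-ins, not the in-tree API):

  #include <cstdint>

  struct Subtarget {
    bool Offset3fBug = false; // set on gfx1010-class parts
    bool hasOffset3fBug() const { return Offset3fBug; }
  };

  // Worst-case encoded size of a single branch, in bytes. When the offset
  // bug is present, MC may append a 4-byte nop after the branch, so the
  // estimate must budget for it even though the nop is not always emitted.
  uint64_t worstCaseBranchSize(const Subtarget &ST) {
    const uint64_t Base = 4; // s_branch / s_cbranch_* encode in 4 bytes
    return ST.hasOffset3fBug() ? Base + 4 : Base;
  }

Overestimating is the safe direction: a size that undercounts could leave a
short branch whose offset no longer fits once the workaround nop lands. The
new lit test below pins down both behaviors.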
Index: llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll
@@ -0,0 +1,100 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1030 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-s-branch-bits=7 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1010 %s
+
+; For gfx1010, overestimate the branch size in case we need to insert
+; a nop for the buggy offset.
+
+; GCN-LABEL: long_forward_scc_branch_3f_offset_bug:
+; GFX1030: s_cmp_lg_u32
+; GFX1030-NEXT: s_cbranch_scc1 [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; GFX1010: s_cmp_lg_u32
+; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:BB[0-9]+_[0-9]+]]
+; GFX1010: s_getpc_b64
+; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB
+; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX1010: [[RELAX_BB]]:
+
+; GCN: v_nop
+; GCN: s_sleep
+; GCN: s_cbranch_scc1
+
+; GCN: [[ENDBB]]:
+; GCN: global_store_dword
+define amdgpu_kernel void @long_forward_scc_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+bb0:
+  %cmp0 = icmp eq i32 %cnd0, 0
+  br i1 %cmp0, label %bb2, label %bb3
+
+bb2:
+  %val = call i32 asm sideeffect
+   "s_mov_b32 $0, 0
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64", "=s"() ; 20 * 12 = 240
+  call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
+  %cmp1 = icmp eq i32 %val, 0 ; +4 = 248
+  br i1 %cmp1, label %bb2, label %bb3 ; +4 (gfx1030), +8 with workaround (gfx1010)
+
+bb3:
+  store volatile i32 %cnd0, i32 addrspace(1)* %arg
+  ret void
+}
+
+; GCN-LABEL: {{^}}long_forward_exec_branch_3f_offset_bug:
+; GFX1030: v_cmp_eq_u32
+; GFX1030: s_and_saveexec_b32
+; GFX1030-NEXT: s_cbranch_execz [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; GFX1010: v_cmp_eq_u32
+; GFX1010: s_and_saveexec_b32
+; GFX1010-NEXT: s_cbranch_execnz [[RELAX_BB:BB[0-9]+_[0-9]+]]
+; GFX1010: s_getpc_b64
+; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB
+; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX1010: [[RELAX_BB]]:
+
+; GCN: v_nop
+; GCN: s_sleep
+; GCN: s_cbranch_execz
+
+; GCN: [[ENDBB]]:
+; GCN: global_store_dword
+define void @long_forward_exec_branch_3f_offset_bug(i32 addrspace(1)* %arg, i32 %cnd0) #0 {
+bb0:
+  %cmp0 = icmp eq i32 %cnd0, 0
+  br i1 %cmp0, label %bb2, label %bb3
+
+bb2:
+  %val = call i32 asm sideeffect
+   "v_mov_b32 $0, 0
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64
+    v_nop_e64", "=v"() ; 20 * 12 = 240
+  call void @llvm.amdgcn.s.sleep(i32 0) ; +4 = 244
+  %cmp1 = icmp eq i32 %val, 0 ; +4 = 248
+  br i1 %cmp1, label %bb2, label %bb3 ; +4 (gfx1030), +8 with workaround (gfx1010)
+
+bb3:
+  store volatile i32 %cnd0, i32 addrspace(1)* %arg
+  ret void
+}
+
+declare void @llvm.amdgcn.s.sleep(i32 immarg)
+
+attributes #0 = { nounwind }
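Why the 4-byte overestimate flips the outcome in these tests can be checked
with a little arithmetic. A minimal sketch, assuming a short branch under
-amdgpu-s-branch-bits=7 reaches (2^6 - 1) = 63 words forward; this is a
simplification of what the BranchRelaxation pass actually computes:

  #include <cstdint>
  #include <iostream>

  // Approximate forward reach, in bytes, of a short branch whose word
  // offset is restricted to N signed bits (N = 7 gives 63 words).
  constexpr int64_t shortBranchReachBytes(unsigned OffsetBits) {
    return ((int64_t(1) << (OffsetBits - 1)) - 1) * 4;
  }

  int main() {
    const int64_t Limit = shortBranchReachBytes(7); // 252 bytes
    // Code the branch must clear: asm block + s_sleep + compare, taken
    // from the running byte counts in the test comments above.
    const int64_t BlockBytes = 248;
    // gfx1030 counts the branch as 4 bytes; gfx1010 as 8 (nop budget).
    for (int64_t BranchBytes : {int64_t(4), int64_t(8)}) {
      const bool Relax = BlockBytes + BranchBytes > Limit;
      std::cout << "branch counted as " << BranchBytes << " bytes: "
                << (Relax ? "relaxed to s_getpc long jump" : "short branch")
                << '\n';
    }
  }

With the workaround, gfx1010 budgets 248 + 8 = 256 bytes, which exceeds the
252-byte reach, so it emits the s_getpc_b64/s_add_u32/s_addc_u32 sequence the
GFX1010 checks expect; gfx1030 budgets 248 + 4 = 252, which still fits, and
keeps the direct s_cbranch.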