diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp @@ -41,9 +41,6 @@ const TargetSubtargetInfo &ST; const AsmPrinter &AP; - const MCExpr *getLongBranchBlockExpr(const MachineBasicBlock &SrcBB, - const MachineOperand &MO) const; - public: AMDGPUMCInstLower(MCContext &ctx, const TargetSubtargetInfo &ST, const AsmPrinter &AP); @@ -95,54 +92,21 @@ } } -const MCExpr *AMDGPUMCInstLower::getLongBranchBlockExpr( - const MachineBasicBlock &SrcBB, - const MachineOperand &MO) const { - const MCExpr *DestBBSym - = MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx); - const MCExpr *SrcBBSym = MCSymbolRefExpr::create(SrcBB.getSymbol(), Ctx); - - // FIXME: The first half of this assert should be removed. This should - // probably be PC relative instead of using the source block symbol, and - // therefore the indirect branch expansion should use a bundle. - assert( - skipDebugInstructionsForward(SrcBB.begin(), SrcBB.end())->getOpcode() == - AMDGPU::S_GETPC_B64 && - ST.getInstrInfo()->get(AMDGPU::S_GETPC_B64).Size == 4); - - // s_getpc_b64 returns the address of next instruction. 
- const MCConstantExpr *One = MCConstantExpr::create(4, Ctx); - SrcBBSym = MCBinaryExpr::createAdd(SrcBBSym, One, Ctx); - - if (MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_FORWARD) - return MCBinaryExpr::createSub(DestBBSym, SrcBBSym, Ctx); - - assert(MO.getTargetFlags() == SIInstrInfo::MO_LONG_BRANCH_BACKWARD); - return MCBinaryExpr::createSub(SrcBBSym, DestBBSym, Ctx); -} - bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { switch (MO.getType()) { default: - llvm_unreachable("unknown operand type"); + break; case MachineOperand::MO_Immediate: MCOp = MCOperand::createImm(MO.getImm()); return true; case MachineOperand::MO_Register: MCOp = MCOperand::createReg(AMDGPU::getMCReg(MO.getReg(), ST)); return true; - case MachineOperand::MO_MachineBasicBlock: { - if (MO.getTargetFlags() != 0) { - MCOp = MCOperand::createExpr( - getLongBranchBlockExpr(*MO.getParent()->getParent(), MO)); - } else { - MCOp = MCOperand::createExpr( + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::createExpr( MCSymbolRefExpr::create(MO.getMBB()->getSymbol(), Ctx)); - } - return true; - } case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); SmallString<128> SymbolName; @@ -168,7 +132,15 @@ case MachineOperand::MO_RegisterMask: // Regmasks are like implicit defs. return false; + case MachineOperand::MO_MCSymbol: + if (MO.getTargetFlags() == SIInstrInfo::MO_FAR_BRANCH_OFFSET) { + MCSymbol *Sym = MO.getMCSymbol(); + MCOp = MCOperand::createExpr(Sym->getVariableValue()); + return true; + } + break; } + llvm_unreachable("unknown operand type"); } void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -161,8 +161,7 @@ // MO_REL32_HI -> symbol@rel32@hi -> R_AMDGPU_REL32_HI. 
MO_REL32_HI = 5, - MO_LONG_BRANCH_FORWARD = 6, - MO_LONG_BRANCH_BACKWARD = 7, + MO_FAR_BRANCH_OFFSET = 6, MO_ABS32_LO = 8, MO_ABS32_HI = 9, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -23,6 +23,7 @@ #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/MC/MCContext.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Support/CommandLine.h" @@ -2229,32 +2230,36 @@ // s_getpc_b64. Insert pc arithmetic code before last terminator. MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg); - // TODO: Handle > 32-bit block address. - if (BrOffset >= 0) { - BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) - .addReg(PCReg, RegState::Define, AMDGPU::sub0) - .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD); - BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) - .addReg(PCReg, RegState::Define, AMDGPU::sub1) - .addReg(PCReg, 0, AMDGPU::sub1) - .addImm(0); - } else { - // Backwards branch. 
- BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32)) + auto &MCCtx = MF->getContext(); + MCSymbol *PostGetPCLabel = + MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true); + GetPC->setPostInstrSymbol(*MF, PostGetPCLabel); + + MCSymbol *OffsetLo = + MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true); + MCSymbol *OffsetHi = + MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true); + BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub0) .addReg(PCReg, 0, AMDGPU::sub0) - .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD); - BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32)) + .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET); + BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32)) .addReg(PCReg, RegState::Define, AMDGPU::sub1) .addReg(PCReg, 0, AMDGPU::sub1) - .addImm(0); - } + .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET); // Insert the indirect branch after the other terminator. BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64)) .addReg(PCReg); + auto ComputeBlockSize = [](const TargetInstrInfo *TII, + const MachineBasicBlock &MBB) { + unsigned Size = 0; + for (const MachineInstr &MI : MBB) + Size += TII->getInstSizeInBytes(MI); + return Size; + }; + // FIXME: If spilling is necessary, this will fail because this scavenger has // no emergency stack slots. It is non-trivial to spill in this situation, // because the restore code needs to be specially placed after the @@ -2299,7 +2304,16 @@ MRI.clearVirtRegs(); RS->setRegUsed(Scav); - return 4 + 8 + 4 + 4; + // Now the branch distance can be defined. + auto *Offset = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(DestBB.getSymbol(), MCCtx), + MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx); + // Add offset assignments. 
+ auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx); + OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx)); + auto *ShAmt = MCConstantExpr::create(32, MCCtx); + OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx)); + return ComputeBlockSize(this, MBB); } unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) { diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-bundle.ll @@ -20,6 +20,7 @@ ; GCN-LABEL: {{^}}bundle_size: ; GCN: s_cbranch_scc0 [[BB_EXPANSION:BB[0-9]+_[0-9]+]] ; GCN: s_getpc_b64 +; GCN-NEXT: .Lpost_getpc{{[0-9]+}}:{{$}} ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 ; GCN-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir @@ -8,8 +8,9 @@ # GCN-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- [DW_OP_plus_uconst 12, DW_OP_stack_value] # GCN-NEXT: .loc 1 0 42 is_stmt 0 ; /tmp/test_debug_value.cl:0:42 # GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -# GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], BB0_4-(BB0_5+4) -# GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0 +# GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +# GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], (BB0_4-[[POST_GETPC]])&4294967295 +# GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], (BB0_4-[[POST_GETPC]])>>32 # GCN-NEXT: s_setpc_b64 --- | diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll +++ 
b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx10-branch-offset-bug.ll @@ -11,8 +11,9 @@ ; GFX1010: s_cmp_lg_u32 ; GFX1010-NEXT: s_cbranch_scc0 [[RELAX_BB:BB[0-9]+_[0-9]+]] ; GFX1010: s_getpc_b64 -; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB -; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GFX1010-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GFX1010-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GFX1010-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])>>32 ; GFX1010: [[RELAX_BB]]: ; GCN: v_nop @@ -59,8 +60,9 @@ ; GFX1010-NEXT: s_cbranch_execnz [[RELAX_BB:BB[0-9]+_[0-9]+]] ; GCN: s_getpc_b64 -; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[ENDBB:BB[0-9]+_[0-9]+]]-(BB -; GCN-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}} +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])>>32 ; GCN: [[RELAX_BB]]: ; GCN: v_nop diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -60,10 +60,11 @@ ; GCN: s_cmp_eq_u32 [[CND]], 0 ; GCN-NEXT: s_cbranch_scc0 [[LONGBB:BB[0-9]+_[0-9]+]] -; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0 +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb0 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4) -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0 +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 
([[ENDBB]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: [[LONGBB]]: @@ -104,10 +105,11 @@ ; GCN-DAG: s_and_b64 vcc, exec, [[UNMASKED]] ; GCN: s_cbranch_vccz [[LONGBB:BB[0-9]+_[0-9]+]] -; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb0 +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb0 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[ENDBB:BB[0-9]+_[0-9]+]]-([[LONG_JUMP]]+4) -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0 +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[ENDBB:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[ENDBB]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: [[LONGBB]]: @@ -190,12 +192,13 @@ ; GCN-NEXT: s_cbranch_scc0 [[ENDBB:BB[0-9]+_[0-9]+]] -; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2 +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb2 ; GCN-NEXT: ; in Loop: Header=[[LOOPBB]] Depth=1 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_JUMP]]+4)-[[LOOPBB]] -; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0 +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LOOPBB]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LOOPBB]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: [[ENDBB]]: @@ -226,10 +229,11 @@ ; GCN: s_cmp_eq_u32 ; GCN: s_cbranch_scc{{[0-1]}} [[BB2:BB[0-9]+_[0-9]+]] -; GCN-NEXT: [[LONG_JUMP0:BB[0-9]+_[0-9]+]]: ; %bb0 +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %bb0 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], [[BB3:BB[0-9]_[0-9]+]]-([[LONG_JUMP0]]+4) -; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], 0{{$}} +; GCN-NEXT: 
[[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC0_LO]], s[[PC0_LO]], ([[BB3:BB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC0_HI]], s[[PC0_HI]], ([[BB3:BB[0-9]_[0-9]+]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC0_LO]]:[[PC0_HI]]{{\]}} ; GCN: [[BB2]]: ; %bb3 @@ -282,12 +286,13 @@ ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %loop ; GCN-NEXT: ; in Loop: Header=[[LOOP]] Depth=1 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP]] -; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}} +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LOOP]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LOOP]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: .Lfunc_end{{[0-9]+}}: define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { @@ -318,10 +323,11 @@ ; GCN-NEXT: v_cmp_{{eq|ne}}_u32_e64 ; GCN: s_cbranch_vccz [[BB2:BB[0-9]_[0-9]+]] -; GCN-NEXT: [[LONGBB1:BB[0-9]+_[0-9]+]]: +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC1_LO:[0-9]+]]:[[PC1_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], [[BB3:BB[0-9]+_[0-9]+]]-([[LONGBB1]]+4) -; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], 0{{$}} +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC1_LO]], s[[PC1_LO]], ([[BB3:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC1_HI]], s[[PC1_HI]], ([[BB3:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC1_LO]]:[[PC1_HI]]{{\]}} ; GCN-NEXT: [[BB2]]: ; %bb2 @@ -376,10 +382,11 @@ ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; GCN-NEXT: s_cbranch_execnz 
[[IF:BB[0-9]+_[0-9]+]] -; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %entry +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %entry ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], [[BB2:BB[0-9]_[0-9]+]]-([[LONGBB]]+4) -; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], 0{{$}} +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[BB2:BB[0-9]_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[BB2:BB[0-9]_[0-9]+]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: [[IF]]: ; %if @@ -438,11 +445,12 @@ ; GCN: ;;#ASMEND ; GCN: s_cbranch_{{vccz|vccnz}} [[RET:BB[0-9]+_[0-9]+]] -; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop +; GCN-NEXT: {{BB[0-9]+_[0-9]+}}: ; %loop ; GCN-NEXT: ; in Loop: Header=[[LOOP_BODY]] Depth=1 ; GCN-NEXT: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} -; GCN-NEXT: s_sub_u32 s[[PC_LO]], s[[PC_LO]], ([[LONGBB]]+4)-[[LOOP_BODY]] -; GCN-NEXT: s_subb_u32 s[[PC_HI]], s[[PC_HI]], 0 +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LOOP_BODY]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LOOP_BODY]]-[[POST_GETPC]])>>32 ; GCN-NEXT: s_setpc_b64 s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock @@ -479,9 +487,11 @@ ; GCN: s_cbranch_scc{{[0-1]}} [[LONG_BR_0:BB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: -; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-( -; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_setpc_b64 +; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} +; GCN-NEXT: [[POST_GETPC:.Lpost_getpc[0-9]+]]:{{$}} +; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], ([[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-[[POST_GETPC]])&4294967295 +; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], ([[LONG_BR_DEST0]]-[[POST_GETPC]])>>32 +; GCN-NEXT: s_setpc_b64 
s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: [[LONG_BR_0]]: ; GCN: [[LONG_BR_DEST0]]: