Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -26,6 +26,16 @@
 private:
   const SIRegisterInfo RI;
 
+  // The inverse predicate should have the negative value.
+  enum BranchPredicate {
+    INVALID_BR = 0,
+    SCC_TRUE = 1,
+    SCC_FALSE = -1
+  };
+
+  static unsigned getBranchOpcode(BranchPredicate Cond);
+  static BranchPredicate getBranchPredicate(unsigned Opcode);
+
   unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
                               MachineRegisterInfo &MRI,
                               MachineOperand &SuperReg,
@@ -136,6 +146,17 @@
                              unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+                        DebugLoc DL) const override;
+
   bool areMemAccessesTriviallyDisjoint(
     MachineInstr *MIa, MachineInstr *MIb,
     AliasAnalysis *AA = nullptr) const override;
@@ -493,7 +514,6 @@
 
   ScheduleHazardRecognizer *
   CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const override;
-
 };
 
 namespace AMDGPU {
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1057,6 +1057,115 @@
   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
+unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
+  switch (Cond) {
+  case SIInstrInfo::SCC_TRUE:
+    return AMDGPU::S_CBRANCH_SCC1;
+  case SIInstrInfo::SCC_FALSE:
+    return AMDGPU::S_CBRANCH_SCC0;
+  default:
+    llvm_unreachable("invalid branch predicate");
+  }
+}
+
+SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_CBRANCH_SCC0:
+    return SCC_FALSE;
+  case AMDGPU::S_CBRANCH_SCC1:
+    return SCC_TRUE;
+  default:
+    return INVALID_BR;
+  }
+}
+
+bool SIInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    // Unconditional Branch
+    TBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+  if (Pred == INVALID_BR)
+    return true;
+
+  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Pred));
+
+  ++I;
+
+  if (I == MBB.end()) {
+    // Conditional branch followed by fall-through.
+    TBB = CondBB;
+    return false;
+  }
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    TBB = CondBB;
+    FBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  return true;
+}
+
+unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  unsigned Count = 0;
+  while (I != MBB.end()) {
+    MachineBasicBlock::iterator Next = std::next(I);
+    I->eraseFromParent();
+    ++Count;
+    I = Next;
+  }
+
+  return Count;
+}
+
+unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *TBB,
+                                   MachineBasicBlock *FBB,
+                                   ArrayRef<MachineOperand> Cond,
+                                   DebugLoc DL) const {
+
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+      .addMBB(TBB);
+    return 1;
+  }
+
+  assert(TBB && Cond[0].isImm());
+
+  unsigned Opcode
+    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+  if (!FBB) {
+    BuildMI(&MBB, DL, get(Opcode))
+      .addMBB(TBB);
+    return 1;
+  }
+
+  assert(TBB && FBB);
+
+  BuildMI(&MBB, DL, get(Opcode))
+    .addMBB(TBB);
+  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+    .addMBB(FBB);
+
+  return 2;
+}
+
 static void removeModOperands(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
Index: test/CodeGen/AMDGPU/ctpop64.ll
===================================================================
--- test/CodeGen/AMDGPU/ctpop64.ll
+++ test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
@@ -110,12 +110,9 @@
   ret void
 }
 
-; FIXME: We currently disallow SALU instructions in all branches,
-; but there are some cases when the should be allowed.
-
 ; FUNC-LABEL: {{^}}ctpop_i64_in_br:
-; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
-; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
+; SI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
+; VI-DAG: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
 ; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
 ; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
Index: test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -3,19 +3,19 @@
 ; Make sure that m0 is not reinitialized in the loop.
 
 ; GCN-LABEL: {{^}}copy_local_to_global_loop_m0_init:
-; GCN: s_cbranch_scc1 BB0_2
+; GCN: s_cbranch_scc1 BB0_3
 
 ; Initialize in preheader
 ; GCN: s_mov_b32 m0, -1
 
-; GCN: BB0_3:
+; GCN: BB0_2:
 ; GCN: ds_read_b32
 ; GCN: buffer_store_dword
 
-; GCN: s_cbranch_vccnz BB0_2
-; GCN: s_branch BB0_3
+; GCN: s_cbranch_vccnz BB0_3
+; GCN: s_branch BB0_2
 
-; GCN: BB0_2:
+; GCN: BB0_3:
 ; GCN-NEXT: s_endpgm
 define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
 bb:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.inv() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; GCN-LABEL: {{^}}test_s_dcache_inv:
 ; GCN-NEXT: ; BB#0:
@@ -15,10 +16,11 @@
 
 ; GCN-LABEL: {{^}}test_s_dcache_inv_insert_wait:
 ; GCN-NEXT: ; BB#0:
-; GCN-NEXT: s_dcache_inv
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_dcache_inv
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_inv_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.inv.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; GCN-LABEL: {{^}}test_s_dcache_inv_vol:
 ; GCN-NEXT: ; BB#0:
@@ -16,9 +17,10 @@
 ; GCN-LABEL: {{^}}test_s_dcache_inv_vol_insert_wait:
 ; GCN-NEXT: ; BB#0:
 ; GCN-NEXT: s_dcache_inv_vol
-; GCN-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; GCN: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_inv_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv.vol()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.wb() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; VI-LABEL: {{^}}test_s_dcache_wb:
 ; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@
 ; VI-LABEL: {{^}}test_s_dcache_wb_insert_wait:
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_wb_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=fiji -show-mc-encoding < %s | FileCheck -check-prefix=VI %s
 
 declare void @llvm.amdgcn.s.dcache.wb.vol() #0
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
 
 ; VI-LABEL: {{^}}test_s_dcache_wb_vol:
 ; VI-NEXT: ; BB#0:
@@ -14,9 +15,10 @@
 ; VI-LABEL: {{^}}test_s_dcache_wb_vol_insert_wait:
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb_vol
-; VI-NEXT: s_waitcnt lgkmcnt(0) ; encoding
+; VI: s_waitcnt lgkmcnt(0) ; encoding
 define void @test_s_dcache_wb_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb.vol()
+  call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
 
 end:
Index: test/CodeGen/AMDGPU/sgpr-control-flow.ll
===================================================================
--- test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 ;
 ;
 ; Most SALU instructions ignore control flow, so we need to make sure
@@ -67,7 +67,7 @@
 ; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]
 
-; SI: BB2_1:
+; SI: BB2_2:
 ; SI: buffer_load_dword [[AVAL:v[0-9]+]]
 ; SI: v_cmp_eq_i32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
 ; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -166,31 +166,70 @@
 }
 
-; SI-LABEL: {{^}}uniform_if_else:
+; SI-LABEL: {{^}}uniform_if_else_ret:
 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
-; SI: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]]
+; SI-NEXT: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]]
+; SI-NEXT: s_branch [[IF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: [[ELSE_LABEL]]:
+; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; SI: buffer_store_dword [[TWO]]
+; SI: s_endpgm
+
+; SI: {{^}}[[IF_LABEL]]:
 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; SI: buffer_store_dword [[ONE]]
-; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: s_endpgm
+define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+entry:
+  %cmp = icmp eq i32 %a, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+  store i32 1, i32 addrspace(1)* %out
+  br label %if.end
+
+if.else: ; preds = %entry
+  store i32 2, i32 addrspace(1)* %out
+  br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+  ret void
+}
+
+; SI-LABEL: {{^}}uniform_if_else:
+; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
+; SI-NEXT: s_cbranch_scc1 [[ELSE_LABEL:[0-9_A-Za-z]+]]
+; SI-NEXT: s_branch [[IF_LABEL:[0-9_A-Za-z]+]]
+
 ; SI: [[ELSE_LABEL]]:
 ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; SI: buffer_store_dword [[TWO]]
+; SI: s_branch [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+
+; SI: [[IF_LABEL]]:
+; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; SI: buffer_store_dword [[ONE]]
+
 ; SI: [[ENDIF_LABEL]]:
+; SI: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
+; SI: buffer_store_dword [[THREE]]
 ; SI: s_endpgm
-define void @uniform_if_else(i32 addrspace(1)* nocapture %out, i32 %a) {
+define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
 
 if.then: ; preds = %entry
-  store i32 1, i32 addrspace(1)* %out
+  store i32 1, i32 addrspace(1)* %out0
   br label %if.end
 
 if.else: ; preds = %entry
-  store i32 2, i32 addrspace(1)* %out
+  store i32 2, i32 addrspace(1)* %out0
   br label %if.end
 
 if.end: ; preds = %if.else, %if.then
+  store i32 3, i32 addrspace(1)* %out1
   ret void
 }
 
@@ -368,15 +407,15 @@
 
 ; SI-LABEL: {{^}}cse_uniform_condition_different_blocks:
 ; SI: s_load_dword [[COND:s[0-9]+]]
 ; SI: s_cmp_lt_i32 [[COND]], 1
-; SI: s_cbranch_scc1 BB13_3
+; SI: s_cbranch_scc1 BB[[FNNUM:[0-9]+]]_3
 
 ; SI: BB#1:
 ; SI-NOT: cmp
 ; SI: buffer_load_dword
 ; SI: buffer_store_dword
-; SI: s_cbranch_scc1 BB13_3
+; SI: s_cbranch_scc1 BB[[FNNUM]]_3
 
-; SI: BB13_3:
+; SI: BB[[FNNUM]]_3:
 ; SI: s_endpgm
 define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
 bb:
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -72,19 +72,18 @@
 ; SI: v_cmp_ne_i32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz BB2_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
-; SI: ; BB#1:
 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
-; SI: BB2_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword
 ; SI-DAG: buffer_store_dword
 ; SI-DAG: v_cmp_eq_i32_e32 vcc,
 ; SI-DAG: s_and_b64 vcc, exec, vcc
-; SI: s_cbranch_vccnz BB2_2
-; SI: s_branch BB2_3
-; SI: BB2_2:
+; SI: s_cbranch_vccnz [[LABEL_EXIT]]
+; SI: s_branch [[LABEL_LOOP]]
+; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
 
 define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
@@ -117,7 +116,7 @@
 ; SI: v_cmp_lt_i32_e32 vcc
 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz BB3_2
+; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; Initialize inner condition to false
 ; SI: ; BB#1:
@@ -125,7 +124,7 @@
 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
 
 ; Clear exec bits for workitems that load -1s
-; SI: BB3_3:
+; SI: [[LABEL_LOOP:BB[0-9]+_[0-9]+]]:
 ; SI: buffer_load_dword [[B:v[0-9]+]]
 ; SI: buffer_load_dword [[A:v[0-9]+]]
 ; SI-DAG: v_cmp_ne_i32_e64 [[NEG1_CHECK_0:s\[[0-9]+:[0-9]+\]]], -1, [[A]]
@@ -133,23 +132,23 @@
 ; SI: s_and_b64 [[ORNEG1:s\[[0-9]+:[0-9]+\]]], [[NEG1_CHECK_1]], [[NEG1_CHECK_0]]
 ; SI: s_and_saveexec_b64 [[ORNEG2:s\[[0-9]+:[0-9]+\]]], [[ORNEG1]]
 ; SI: s_xor_b64 [[ORNEG2]], exec, [[ORNEG2]]
-; SI: s_cbranch_execz BB3_5
+; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]
 
-; SI: BB#4:
+; SI: BB#3:
 ; SI: buffer_store_dword
 ; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
 ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
 
-; SI: BB3_5:
+; SI: [[LABEL_FLOW]]:
 ; SI: s_or_b64 exec, exec, [[ORNEG2]]
 ; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
 ; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz BB3_3
+; SI: s_cbranch_execnz [[LABEL_LOOP]]
 
-; SI: BB#6
+; SI: BB#5
 ; SI: s_or_b64 exec, exec, [[COND_STATE]]
 
-; SI: BB3_2:
+; SI: [[LABEL_EXIT]]:
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
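
A note on the BranchPredicate encoding added in SIInstrInfo.h above: because SCC_TRUE and SCC_FALSE are defined as 1 and -1, inverting a condition amounts to negating the immediate stored in Cond[0], which is the property a ReverseBranchCondition-style hook could later exploit. Below is a minimal standalone sketch of that invariant; it is illustrative only and not part of the patch, and the invert helper is hypothetical:

#include <cassert>

// Mirrors the enum added to SIInstrInfo.h: the inverse predicate is the
// negation of the original value.
enum BranchPredicate {
  INVALID_BR = 0,
  SCC_TRUE = 1,
  SCC_FALSE = -1
};

// Hypothetical helper: inverting a valid predicate is plain negation.
static BranchPredicate invert(BranchPredicate Pred) {
  assert(Pred != INVALID_BR && "cannot invert an invalid predicate");
  return static_cast<BranchPredicate>(-static_cast<int>(Pred));
}

int main() {
  // branch-if-SCC-set inverts to branch-if-SCC-clear, and vice versa.
  assert(invert(SCC_TRUE) == SCC_FALSE);
  assert(invert(SCC_FALSE) == SCC_TRUE);
  return 0;
}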