Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -504,6 +504,11 @@
     this->GISel.reset(&GISel);
   }
 
+  // XXX - Why is this here if it isn't in the default pass set?
+  bool enableEarlyIfConversion() const override {
+    return true;
+  }
+
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -48,6 +48,11 @@
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool>
+EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
+                        cl::desc("Run early if-conversion"),
+                        cl::init(true));
+
 static cl::opt<bool> EnableR600IfConvert(
   "r600-if-convert",
   cl::desc("Use if conversion pass"),
@@ -334,6 +339,7 @@
   void addIRPasses() override;
   bool addPreISel() override;
   void addMachineSSAOptimization() override;
+  bool addILPOpts() override;
   bool addInstSelector() override;
 #ifdef LLVM_BUILD_GLOBAL_ISEL
   bool addIRTranslator() override;
@@ -526,6 +532,14 @@
     addPass(&SILoadStoreOptimizerID);
 }
 
+bool GCNPassConfig::addILPOpts() {
+  if (EnableEarlyIfConversion)
+    addPass(&EarlyIfConverterID);
+
+  TargetPassConfig::addILPOpts();
+  return false;
+}
+
 void GCNPassConfig::addIRPasses() {
   // TODO: May want to move later or split into an early and late one.
   addPass(createAMDGPUCodeGenPreparePass(&getGCNTargetMachine()));
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -203,6 +203,18 @@
   bool reverseBranchCondition(
     SmallVectorImpl<MachineOperand> &Cond) const override;
 
+  bool canInsertSelect(const MachineBasicBlock &MBB,
+                       ArrayRef<MachineOperand> Cond,
+                       unsigned TrueReg, unsigned FalseReg,
+                       int &CondCycles,
+                       int &TrueCycles, int &FalseCycles) const override;
+
+  void insertSelect(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator I, const DebugLoc &DL,
+                    unsigned DstReg, ArrayRef<MachineOperand> Cond,
+                    unsigned TrueReg, unsigned FalseReg) const override;
+
   bool areMemAccessesTriviallyDisjoint(
     MachineInstr &MIa, MachineInstr &MIb,
     AliasAnalysis *AA = nullptr) const override;
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1290,6 +1290,13 @@
   return Count;
 }
 
+// Copy the flags onto the implicit condition register operand.
+static void preserveCondRegFlags(MachineOperand &CondReg,
+                                 const MachineOperand &OrigCond) {
+  CondReg.setIsUndef(OrigCond.isUndef());
+  CondReg.setIsKill(OrigCond.isKill());
+}
+
 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                    MachineBasicBlock *TBB,
                                    MachineBasicBlock *FBB,
@@ -1317,9 +1324,7 @@
     .addMBB(TBB);
 
   // Copy the flags onto the implicit condition register operand.
-  MachineOperand &CondReg = CondBr->getOperand(1);
-  CondReg.setIsUndef(Cond[1].isUndef());
-  CondReg.setIsKill(Cond[1].isKill());
+  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
 
   if (BytesAdded)
     *BytesAdded = 4;
@@ -1351,6 +1356,136 @@
   return false;
 }
 
+bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+                                  ArrayRef<MachineOperand> Cond,
+                                  unsigned TrueReg, unsigned FalseReg,
+                                  int &CondCycles,
+                                  int &TrueCycles, int &FalseCycles) const {
+  switch (Cond[0].getImm()) {
+  case VCCNZ:
+  case VCCZ: {
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+    assert(MRI.getRegClass(FalseReg) == RC);
+
+    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+
+    // Limit to equal cost for branch vs. N v_cndmask_b32s.
+    return !RI.isSGPRClass(RC) && NumInsts <= 6;
+  }
+  case SCC_TRUE:
+  case SCC_FALSE: {
+    // FIXME: We could insert for VGPRs if we could replace the original
+    // compare with a vector one.
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+    assert(MRI.getRegClass(FalseReg) == RC);
+
+    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+
+    // Multiples of 8 bytes can use s_cselect_b64.
+    if (NumInsts % 2 == 0)
+      NumInsts /= 2;
+
+    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+    return RI.isSGPRClass(RC);
+  }
+  default:
+    return false;
+  }
+}
+
+void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, const DebugLoc &DL,
+                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
+                               unsigned TrueReg, unsigned FalseReg) const {
+  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
+  if (Pred == VCCZ || Pred == SCC_FALSE) {
+    Pred = static_cast<BranchPredicate>(-Pred);
+    std::swap(TrueReg, FalseReg);
+  }
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+  unsigned DstSize = DstRC->getSize();
+
+  if (DstSize == 4) {
+    unsigned SelOp = Pred == SCC_TRUE ?
+      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
+
+    // Instruction's operands are backwards from what is expected.
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(SelOp), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg);
+
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+    return;
+  }
+
+  if (DstSize == 8 && Pred == SCC_TRUE) {
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg);
+
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+    return;
+  }
+
+  static const int16_t Sub0_15[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+  };
+
+  static const int16_t Sub0_15_64[] = {
+    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+  };
+
+  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
+  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
+  const int16_t *SubIndices = Sub0_15;
+  int NElts = DstSize / 4;
+
+  // 64-bit select is only available for SALU.
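+  // Wider selects are decomposed below into one select per element
+  // (v_cndmask_b32 per 32 bits on the VALU path, s_cselect_b64 per
+  // 64 bits on the SALU path) and recombined with a REG_SEQUENCE.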
+  if (Pred == SCC_TRUE) {
+    SelOp = AMDGPU::S_CSELECT_B64;
+    EltRC = &AMDGPU::SGPR_64RegClass;
+    SubIndices = Sub0_15_64;
+
+    assert(NElts % 2 == 0);
+    NElts /= 2;
+  }
+
+  MachineInstrBuilder MIB = BuildMI(
+    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
+
+  I = MIB->getIterator();
+
+  SmallVector<unsigned, 8> Regs;
+  for (int Idx = 0; Idx != NElts; ++Idx) {
+    unsigned DstElt = MRI.createVirtualRegister(EltRC);
+    Regs.push_back(DstElt);
+
+    unsigned SubIdx = SubIndices[Idx];
+
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(SelOp), DstElt)
+      .addReg(FalseReg, 0, SubIdx)
+      .addReg(TrueReg, 0, SubIdx);
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+
+    MIB.addReg(DstElt)
+       .addImm(SubIdx);
+  }
+}
+
 static void removeModOperands(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
Index: lib/Target/AMDGPU/SISchedule.td
===================================================================
--- lib/Target/AMDGPU/SISchedule.td
+++ lib/Target/AMDGPU/SISchedule.td
@@ -53,6 +53,11 @@
   let MicroOpBufferSize = 1;
   let IssueWidth = 1;
   let PostRAScheduler = 1;
+
+  // FIXME: Approximate 2 * branch cost. Try to hack around bad
+  // early-ifcvt heuristics. These need improvement to avoid the
+  // out-of-order execution (OOE) heuristics.
+  int MispredictPenalty = 20;
 }
 
 def SIFullSpeedModel : SISchedMachineModel;
Index: test/CodeGen/AMDGPU/early-if-convert-cost.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -0,0 +1,110 @@
+; RUN: llc -stress-early-ifcvt -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: Most of these cases don't trigger because of broken cost
+; heuristics. They should not need -stress-early-ifcvt.
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle64:
+; GCN: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GCN: v_cmp_neq_f64_e32 vcc, 1.0, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
+; GCN: v_add_f64 v{{\[}}[[ADD_LO:[0-9]+]]:[[ADD_HI:[0-9]+]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}, v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
+; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc
+; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc
+; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+entry:
+  %v = load double, double addrspace(1)* %in
+  %cc = fcmp oeq double %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd double %v, %v
+  br label %endif
+
+endif:
+  %r = phi double [ %v, %entry ], [ %u, %if ]
+  store double %r, double addrspace(1)* %out
+  ret void
+}
+
+; vcc branch with SGPR inputs
+; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle64:
+; GCN: v_cmp_neq_f64
+; GCN: v_add_f64
+; GCN: v_cndmask_b32_e32
+; GCN: v_cndmask_b32_e32
+define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+entry:
+  %v = load double, double addrspace(2)* %in
+  %cc = fcmp oeq double %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd double %v, %v
+  br label %endif
+
+endif:
+  %r = phi double [ %v, %entry ], [ %u, %if ]
+  store double %r, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle96:
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: s_mov_b64 vcc, [[CMP]]
+
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+
+; GCN-DAG: buffer_store_dword v
+; GCN-DAG: buffer_store_dwordx2
+define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <3 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
+  store <3 x i32> %r, <3 x i32> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle128:
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: v_add_i32_e32
+; GCN: s_mov_b64 vcc, [[CMP]]
+
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+
+; GCN: buffer_store_dwordx4
+define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <4 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
+  store <4 x i32> %r, <4 x i32> addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/early-if-convert.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/early-if-convert.ll
@@ -0,0 +1,454 @@
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: This leaves behind a now-unnecessary AND with exec
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_diamond:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]]
+; GCN: v_cmp_neq_f32_e32 vcc, 1.0, [[VAL]]
+; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
+; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
+; GCN: buffer_store_dword [[RESULT]]
+define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %else
+
+if:
+  %u0 = fadd float %v, %v
+  br label %endif
+
+else:
+  %u1 = fmul float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %u0, %if ], [ %u1, %else ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_vcc_clobber:
+; GCN: ; clobber vcc
+; GCN: v_cmp_neq_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
+; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
+; GCN: s_mov_b64 vcc, [[CMP]]
+; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
+define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
+entry:
+  %v = load i32, i32 addrspace(1)* %in
+  %cc = fcmp oeq float %k, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  call void asm "; clobber $0", "~{VCC}"() #0
+  %u = add i32 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i32 [ %v, %entry ], [ %u, %if ]
+  store i32 %r, i32 addrspace(1)* %out
+  ret void
+}
+
+; Longest chain of cheap instructions that is still converted
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_max_cheap:
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_cndmask_b32_e32
+define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u.0 = fmul float %v, %v
+  %u.1 = fmul float %v, %u.0
+  %u.2 = fmul float %v, %u.1
+  %u.3 = fmul float %v, %u.2
+  %u.4 = fmul float %v, %u.3
+  %u.5 = fmul float %v, %u.4
+  %u.6 = fmul float %v, %u.5
+  %u.7 = fmul float %v, %u.6
+  %u.8 = fmul float %v, %u.7
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u.8, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Shortest chain of cheap instructions that is not converted
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_min_expensive:
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+; GCN: v_mul_f32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u.0 = fmul float %v, %v
+  %u.1 = fmul float %v, %u.0
+  %u.2 = fmul float %v, %u.1
+  %u.3 = fmul float %v, %u.2
+  %u.4 = fmul float %v, %u.3
+  %u.5 = fmul float %v, %u.4
+  %u.6 = fmul float %v, %u.5
+  %u.7 = fmul float %v, %u.6
+  %u.8 = fmul float %v, %u.7
+  %u.9 = fmul float %v, %u.8
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u.9, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Should still branch over the fdiv expansion
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_expensive:
+; GCN: v_cmp_neq_f32_e32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_div_scale_f32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fdiv float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; vcc branch with SGPR inputs
+; GCN-LABEL: {{^}}test_vccnz_sgpr_ifcvt_triangle:
+; GCN: v_cmp_neq_f32_e64
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: s_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
+entry:
+  %v = load i32, i32 addrspace(2)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add i32 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i32 [ %v, %entry ], [ %u, %if ]
+  store i32 %r, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
+; GCN: v_cndmask_b32
+define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
+entry:
+  %v = load float, float addrspace(2)* %in
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Due to a broken cost heuristic, this is not if-converted like
+; test_vccnz_ifcvt_triangle_constant_load, even though it should be.
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
+; GCN: v_cndmask_b32
+define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
+entry:
+  %cc = fcmp oeq float %v, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; Scalar branch and scalar inputs
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle:
+; GCN: s_load_dword [[VAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
+; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
+define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load i32, i32 addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add i32 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i32 [ %v, %entry ], [ %u, %if ]
+  call void asm sideeffect "; reg use $0", "s"(i32 %r) #0
+  ret void
+}
+
+; FIXME: Should be able to use VALU compare and select
+; Scalar branch but VGPR select operands
+; GCN-LABEL: {{^}}test_scc1_vgpr_ifcvt_triangle:
+; GCN: s_cmp_lg_u32
+; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_f32_e32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
+entry:
+  %v = load float, float addrspace(1)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = fadd float %v, %v
+  br label %endif
+
+endif:
+  %r = phi float [ %v, %entry ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle64:
+; GCN: s_add_u32
+; GCN: s_addc_u32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load i64, i64 addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add i64 %v, %v
+  br label %endif
+
+endif:
+  %r = phi i64 [ %v, %entry ], [ %u, %if ]
+  call void asm sideeffect "; reg use $0", "s"(i64 %r) #0
+  ret void
+}
+
+; TODO: Can do s_cselect_b64; s_cselect_b32
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle96:
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <3 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <3 x i32> [ %v, %entry ], [ %u, %if ]
+  %r.ext = shufflevector <3 x i32> %r, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r.ext) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_scc1_sgpr_ifcvt_triangle128:
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_add_i32
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
+entry:
+  %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
+  %cc = icmp eq i32 %cond, 1
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <4 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <4 x i32> [ %v, %entry ], [ %u, %if ]
+  call void asm sideeffect "; reg use $0", "s"(<4 x i32> %r) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
+; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
+; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
+define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+  %cmp0 = icmp eq i32 %cond, 0
+  br i1 %cmp0, label %else, label %if
+
+if:
+  br label %done
+
+else:
+  br label %done
+
+done:
+  %value = phi i32 [0, %if], [1, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}ifcvt_undef_scc:
+; GCN: {{^}}; BB#0:
+; GCN-NEXT: s_load_dwordx2
+; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
+define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+entry:
+  br i1 undef, label %else, label %if
+
+if:
+  br label %done
+
+else:
+  br label %done
+
+done:
+  %value = phi i32 [0, %if], [1, %else]
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle256:
+; GCN: v_cmp_neq_f32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <8 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <8 x i32> [ %v, %entry ], [ %u, %if ]
+  store <8 x i32> %r, <8 x i32> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle512:
+; GCN: v_cmp_neq_f32
+; GCN: s_cbranch_vccnz [[ENDIF:BB[0-9]+_[0-9]+]]
+
+; GCN: v_add_i32
+; GCN: v_add_i32
+
+; GCN: [[ENDIF]]:
+; GCN: buffer_store_dword
+define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
+entry:
+  %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
+  %cc = fcmp oeq float %cnd, 1.000000e+00
+  br i1 %cc, label %if, label %endif
+
+if:
+  %u = add <16 x i32> %v, %v
+  br label %endif
+
+endif:
+  %r = phi <16 x i32> [ %v, %entry ], [ %u, %if ]
+  store <16 x i32> %r, <16 x i32> addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
Index: test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
+++ test/CodeGen/AMDGPU/uniform-branch-intrinsic-cond.ll
@@ -4,9 +4,9 @@
 ; This used to raise an assertion due to how the choice between uniform and
 ; non-uniform branches was determined.
 ;
-; CHECK-LABEL: {{^}}main:
+; CHECK-LABEL: {{^}}test:
 ; CHECK: s_cbranch_vccnz
-define amdgpu_ps float @main(<4 x i32> inreg %rsrc) {
+define amdgpu_ps float @test(<4 x i32> inreg %rsrc) #0 {
 main_body:
   %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 true, i1 false)
   %cc = fcmp une float %v, 1.000000e+00
@@ -14,6 +14,7 @@
 
 if:
   %u = fadd float %v, %v
+  call void asm sideeffect "", ""() #0 ; Prevent if-conversion
   br label %else
 
 else:
@@ -21,7 +22,33 @@
   ret float %r
 }
 
+; FIXME: This leaves behind a now-unnecessary AND with exec
+
+; This version can be if-converted
+; CHECK-LABEL: {{^}}test_vcc_ifcvt:
+; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
+; CHECK: v_cmp_eq_f32_e32 vcc, 1.0, [[VAL]]
+; CHECK: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
+; CHECK: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
+; CHECK: buffer_store_dword [[RESULT]]
+define void @test_vcc_ifcvt(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+main_body:
+  %v = load float, float addrspace(1)* %in
+  %cc = fcmp une float %v, 1.000000e+00
+  br i1 %cc, label %if, label %else
+
+if:
+  %u = fadd float %v, %v
+  br label %else
+
+else:
+  %r = phi float [ %v, %main_body ], [ %u, %if ]
+  store float %r, float addrspace(1)* %out
+  ret void
+}
+
 ; Function Attrs: nounwind readonly
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
 
-attributes #0 = { nounwind readonly }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=0 -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}uniform_if_scc:
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
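Usage note (input.ll below is a placeholder file name): as the updated RUN lines show, the new pass is on by default and can be disabled with the -amdgpu-early-ifcvt flag added by this patch, while the generic -stress-early-ifcvt flag (used in early-if-convert-cost.ll) forces conversion past the cost limits:

  llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=0 input.ll
  llc -march=amdgcn -mcpu=verde -stress-early-ifcvt input.ll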