Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -166,6 +166,7 @@
   SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performBrcondCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
   bool isLegalGlobalAddressingMode(const AddrMode &AM) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -680,6 +680,7 @@
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::BRCOND);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -8634,6 +8635,41 @@
   return SDValue(CSrc, 0);
 }
 
+SDValue SITargetLowering::performBrcondCombine(SDNode *N,
+                                               DAGCombinerInfo &DCI) const {
+  // Fold brcond (setcc (zext i1 x), 0, ne), label -> brcond x, label.
+  // The same holds for zext(x) == 1, sext(x) != 0 and sext(x) == -1,
+  // all of which are equivalent to testing x itself.
+  SDValue CC = N->getOperand(1);
+  if (CC.getOpcode() != ISD::SETCC || !isa<ConstantSDNode>(CC.getOperand(1)))
+    return SDValue();
+
+  SDValue RHS = CC.getOperand(1);
+  ISD::CondCode Code = cast<CondCodeSDNode>(CC.getOperand(2))->get();
+  switch (CC.getOperand(0).getOpcode()) {
+  case ISD::ZERO_EXTEND:
+    if (!(isNullConstant(RHS) && Code == ISD::SETNE) &&
+        !(isOneConstant(RHS) && Code == ISD::SETEQ))
+      return SDValue();
+    break;
+  case ISD::SIGN_EXTEND:
+    if (!(isNullConstant(RHS) && Code == ISD::SETNE) &&
+        !(isAllOnesConstant(RHS) && Code == ISD::SETEQ))
+      return SDValue();
+    break;
+  default:
+    return SDValue();
+  }
+
+  // Only fold if the value being extended is the original i1 condition.
+  SDValue OrigCC = CC.getOperand(0).getOperand(0);
+  if (OrigCC.getValueType() != MVT::i1)
+    return SDValue();
+
+  return DCI.DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, N->getOperand(0),
+                         OrigCC, N->getOperand(2));
+}
+
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
@@ -8760,6 +8796,8 @@
     return performExtractVectorEltCombine(N, DCI);
   case ISD::INSERT_VECTOR_ELT:
     return performInsertVectorEltCombine(N, DCI);
+  case ISD::BRCOND:
+    return performBrcondCombine(N, DCI);
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
Index: test/CodeGen/AMDGPU/dag-combine-brcond.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/dag-combine-brcond.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN: BB0_1:
+; GCN: v_cmp_eq_u32_e64 [[CC:[^,]+]], s{{[0-9]+}}, 0
+; GCN: BB0_2:
+; GCN-NOT: v_cndmask_b32
+; GCN-NOT: v_cmp
+; GCN: s_and_b64 vcc, exec, [[CC]]
+; GCN: s_cbranch_vccz BB0_4
+define amdgpu_kernel void @test(float addrspace(1)* %arg1) {
+bb:
+  %tmp36 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 undef
+  %tmp37 = getelementptr inbounds float, float addrspace(1)* %tmp36, i64 undef
+  br label %bb41
+
+bb40:                                             ; preds = %bb111
+  ret void
+
+bb41:                                             ; preds = %bb111, %bb
+  %tmp42 = phi i32 [ 0, %bb ], [ %tmp112, %bb111 ]
+  %tmp106 = icmp eq i32 %tmp42, 0
+  br label %bb114
+
+bb111:                                            ; preds = %bb150
+  %tmp112 = add nuw nsw i32 %tmp42, 1
+  %tmp113 = icmp eq i32 %tmp112, 32
+  br i1 %tmp113, label %bb40, label %bb41
+
+bb114:                                            ; preds = %bb150, %bb41
+  %tmp115 = phi i32 [ 0, %bb41 ], [ %tmp326, %bb150 ]
+  %tmp116 = shl i32 %tmp115, 5
+  %tmp122 = zext i32 %tmp116 to i64
+  %tmp123 = getelementptr inbounds float, float addrspace(1)* %tmp37, i64 %tmp122
+  br i1 %tmp106, label %bb125, label %bb132
+
+bb125:                                            ; preds = %bb114
+  %tmp128 = getelementptr inbounds float, float addrspace(1)* %tmp123, i64 16
+  %tmp129 = bitcast float addrspace(1)* %tmp128 to <4 x float> addrspace(1)*
+  br label %bb150
+
+bb132:                                            ; preds = %bb114
+  %tmp140 = getelementptr inbounds float, float addrspace(1)* %tmp123, i64 16
+  %tmp141 = bitcast float addrspace(1)* %tmp140 to <4 x float> addrspace(1)*
+  br label %bb150
+
+bb150:                                            ; preds = %bb132, %bb125
+  %tmp152 = phi <4 x float> addrspace(1)* [ %tmp129, %bb125 ], [ %tmp141, %bb132 ]
+  store <4 x float> zeroinitializer, <4 x float> addrspace(1)* %tmp152, align 16
+  %tmp326 = add nuw nsw i32 %tmp115, 2
+  %tmp327 = icmp eq i32 %tmp326, 32
+  br i1 %tmp327, label %bb111, label %bb114
+}
Index: test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
===================================================================
--- test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -367,7 +367,7 @@
 ; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
 
 ; GCN: {{^}}[[FLOW]]:
-; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
+; GCN: s_cbranch_execnz [[FLOW1:BB[0-9]+]]
 
 ; GCN: s_or_b64 exec, exec
 ; GCN: v_mov_b32_e32 v0, 2.0
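
Note on local verification (not part of the patch): assuming an LLVM build with the
AMDGPU target and this change applied, the new test can be run through llvm-lit, or by
hand with the same invocation its RUN line uses:

  llc -march=amdgcn -verify-machineinstrs < test/CodeGen/AMDGPU/dag-combine-brcond.ll \
    | FileCheck -check-prefix=GCN test/CodeGen/AMDGPU/dag-combine-brcond.ll

The CHECK lines assert that the compare result produced by v_cmp in the loop header is
branched on directly via s_and_b64/s_cbranch_vccz, with no v_cndmask_b32/v_cmp pair
rematerializing the i1 condition in the inner block.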