diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3137,6 +3137,12 @@ return true; } +static bool isVectorOp(Instruction &I) { + return I.getType()->isVectorTy() || any_of(I.operands(), [](Use &U) { + return U->getType()->isVectorTy(); + }); +} + /// If this basic block is simple enough, and if a predecessor branches to us /// and one of our successors, fold the block into the predecessor and use /// logical operations to pick the right destination. @@ -3221,23 +3227,31 @@ // number of the bonus instructions we'll need to create when cloning into // each predecessor does not exceed a certain threshold. unsigned NumBonusInsts = 0; + unsigned VectorBonusThreshold = 0; const unsigned PredCount = Preds.size(); for (Instruction &I : *BB) { // Don't check the branch condition comparison itself. if (&I == Cond) continue; - // Ignore dbg intrinsics, and the terminator. - if (isa(I) || isa(I)) + // Ignore the terminator. + if (isa(I)) continue; // I must be safe to execute unconditionally. if (!isSafeToSpeculativelyExecute(&I)) return false; + // Ignore free instructions. + if (TTI && TTI->getUserCost(&I, CostKind) == TargetTransformInfo::TCC_Free) + continue; + + // Add a bonus if we see vector instructions. + if (isVectorOp(I)) + VectorBonusThreshold = 2; // Account for the cost of duplicating this instruction into each // predecessor. NumBonusInsts += PredCount; // Early exits once we reach the limit. - if (NumBonusInsts > BonusInstThreshold) + if (NumBonusInsts > BonusInstThreshold + VectorBonusThreshold) return false; } diff --git a/llvm/test/CodeGen/AArch64/csr-split.ll b/llvm/test/CodeGen/AArch64/csr-split.ll --- a/llvm/test/CodeGen/AArch64/csr-split.ll +++ b/llvm/test/CodeGen/AArch64/csr-split.ll @@ -82,22 +82,22 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w19, -8 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: cbz x0, .LBB1_2 -; CHECK-NEXT: // %bb.1: // %if.end +; CHECK-NEXT: cbz x0, .LBB1_3 +; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: adrp x8, a ; CHECK-NEXT: ldrsw x8, [x8, :lo12:a] ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: cmp x8, x0 -; CHECK-NEXT: b.eq .LBB1_3 -; CHECK-NEXT: .LBB1_2: // %return -; CHECK-NEXT: mov w0, wzr -; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_3: // %if.then2 +; CHECK-NEXT: b.ne .LBB1_3 +; CHECK-NEXT: // %bb.2: // %if.then2 ; CHECK-NEXT: bl callVoid ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: b callNonVoid +; CHECK-NEXT: .LBB1_3: // %return +; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret ; ; CHECK-APPLE-LABEL: test2: ; CHECK-APPLE: ; %bb.0: ; %entry @@ -108,26 +108,26 @@ ; CHECK-APPLE-NEXT: .cfi_offset w29, -16 ; CHECK-APPLE-NEXT: .cfi_offset w19, -24 ; CHECK-APPLE-NEXT: .cfi_offset w20, -32 -; CHECK-APPLE-NEXT: cbz x0, LBB1_2 -; CHECK-APPLE-NEXT: ; %bb.1: ; %if.end +; CHECK-APPLE-NEXT: cbz x0, LBB1_3 +; CHECK-APPLE-NEXT: ; %bb.1: ; %entry ; CHECK-APPLE-NEXT: Lloh2: ; CHECK-APPLE-NEXT: adrp x8, _a@PAGE ; CHECK-APPLE-NEXT: Lloh3: ; CHECK-APPLE-NEXT: ldrsw x8, [x8, _a@PAGEOFF] ; CHECK-APPLE-NEXT: mov x19, x0 ; CHECK-APPLE-NEXT: cmp x8, x0 -; CHECK-APPLE-NEXT: b.eq LBB1_3 -; CHECK-APPLE-NEXT: LBB1_2: ; %return -; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload -; CHECK-APPLE-NEXT: mov w0, wzr -; CHECK-APPLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload -; CHECK-APPLE-NEXT: ret -; CHECK-APPLE-NEXT: LBB1_3: ; %if.then2 +; CHECK-APPLE-NEXT: b.ne LBB1_3 +; CHECK-APPLE-NEXT: ; %bb.2: ; %if.then2 ; CHECK-APPLE-NEXT: bl _callVoid ; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload ; CHECK-APPLE-NEXT: mov x0, x19 ; CHECK-APPLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload ; CHECK-APPLE-NEXT: b _callNonVoid +; CHECK-APPLE-NEXT: LBB1_3: ; %return +; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: mov w0, wzr +; CHECK-APPLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: ret ; CHECK-APPLE-NEXT: .loh AdrpLdr Lloh2, Lloh3 entry: %tobool = icmp eq i32* %p1, null diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-logical.ll @@ -261,24 +261,22 @@ define float @test_separate_anyof_v4sf(<4 x float> %t) { ; CHECK-LABEL: @test_separate_anyof_v4sf( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[T:%.*]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[T]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[T]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[T]], i32 0 -; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T]] -; CHECK-NEXT: [[TMP4:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i1> [[TMP4]] to i4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i4 [[TMP5]], 0 -; CHECK-NEXT: [[CMP18:%.*]] = fcmp ogt float [[TMP3]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND3:%.*]] = select i1 [[TMP6]], i1 true, i1 [[CMP18]] -; CHECK-NEXT: [[CMP23:%.*]] = fcmp ogt float [[TMP2]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND4:%.*]] = select i1 [[OR_COND3]], i1 true, i1 [[CMP23]] -; CHECK-NEXT: [[CMP28:%.*]] = fcmp ogt float [[TMP1]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND5:%.*]] = select i1 [[OR_COND4]], i1 true, i1 [[CMP28]] -; CHECK-NEXT: [[CMP33:%.*]] = fcmp ogt float [[TMP0]], 1.000000e+00 -; CHECK-NEXT: [[OR_COND6:%.*]] = select i1 [[OR_COND5]], i1 true, i1 [[CMP33]] -; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP3]], [[TMP2]] -; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND6]], float 0.000000e+00, float [[ADD]] +; CHECK-NEXT: [[T_FR:%.*]] = freeze <4 x float> [[T:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = fcmp olt <4 x float> [[T_FR]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4 +; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i4 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[DOTNOT]], label [[IF_END:%.*]], label [[RETURN:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <4 x float> [[T_FR]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i1> [[TMP2]] to i4 +; CHECK-NEXT: [[DOTNOT7:%.*]] = icmp eq i4 [[TMP3]], 0 +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[T_FR]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[SHIFT]], [[T_FR]] +; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[DOTNOT7]], float [[ADD]], float 0.000000e+00 +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[IF_END]] ] ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: @@ -353,20 +351,15 @@ ; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <4 x i32> [[T_FR]], zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[TMP0]] to i4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i4 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] -; CHECK: lor.lhs.false: ; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[T_FR]], ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i1> [[TMP3]] to i4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i4 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[RETURN]], label [[IF_END:%.*]] -; CHECK: if.end: +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[TMP2]], [[TMP5]] ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[T_FR]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[T_FR]], [[SHIFT]] ; CHECK-NEXT: [[ADD:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[ADD]] to float -; CHECK-NEXT: br label [[RETURN]] -; CHECK: return: -; CHECK-NEXT: [[RETVAL_0:%.*]] = phi float [ [[CONV]], [[IF_END]] ], [ 0.000000e+00, [[LOR_LHS_FALSE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND]], float 0.000000e+00, float [[CONV]] ; CHECK-NEXT: ret float [[RETVAL_0]] ; entry: diff --git a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll --- a/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll +++ b/llvm/test/Transforms/SimplifyCFG/fold-branch-to-common-dest-free-cost.ll @@ -8,12 +8,11 @@ define void @f(i8* %a, i8* %b, i1 %c, i1 %d, i1 %e) { ; CHECK-LABEL: @f( -; CHECK-NEXT: br i1 [[C:%.*]], label [[L1:%.*]], label [[L3:%.*]] -; CHECK: l1: ; CHECK-NEXT: [[A1:%.*]] = call i8* @llvm.strip.invariant.group.p0i8(i8* [[A:%.*]]) ; CHECK-NEXT: [[B1:%.*]] = call i8* @llvm.strip.invariant.group.p0i8(i8* [[B:%.*]]) ; CHECK-NEXT: [[I:%.*]] = icmp eq i8* [[A1]], [[B1]] -; CHECK-NEXT: br i1 [[I]], label [[L2:%.*]], label [[L3]] +; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[C:%.*]], i1 [[I]], i1 false +; CHECK-NEXT: br i1 [[OR_COND]], label [[L2:%.*]], label [[L3:%.*]] ; CHECK: l2: ; CHECK-NEXT: call void @g1() ; CHECK-NEXT: br label [[RET:%.*]]