Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -437,6 +437,10 @@ /// See PR14725. void fixLCSSAPHIs(); + /// Iteratively sink the scalarized operands of a predicated instruction into + /// the block that was created for it. + void sinkScalarOperands(Instruction *PredInst); + /// Predicate conditional instructions that require predication on their /// respective conditions. void predicateInstructions(); @@ -4248,15 +4252,76 @@ } } +void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) { + + // The basic block and loop containing the predicated instruction. + auto *PredBB = PredInst->getParent(); + auto *VectorLoop = LI->getLoopFor(PredBB); + + // Initialize a worklist with the operands of the predicated instruction. + SetVector Worklist(PredInst->op_begin(), PredInst->op_end()); + + // Returns true if a given use occurs in the predicated block. Phi nodes use + // their operands in their corresponding predecessor blocks. + auto isBlockOfUsePredicated = [&](Use &U) -> bool { + auto *I = cast(U.getUser()); + BasicBlock *BB = I->getParent(); + if (auto *Phi = dyn_cast(I)) + BB = Phi->getIncomingBlock( + PHINode::getIncomingValueNumForOperand(U.getOperandNo())); + return BB == PredBB; + }; + + // Sink the scalarized operands of the predicated instruction into the block + // we created for it. This is an iterative, breadth-first algorithm. When an + // instruction is sunk, it's operands are then added to the worklist. The + // algorithm ends after one pass through the worklist doesn't sink a single + // instruction. We index the worklist to avoid iterator invalidations. + bool Changed; + do { + Changed = false; + for (unsigned Idx = 0; Idx < Worklist.size();) { + auto *I = dyn_cast(Worklist[Idx]); + + // We can't sink an instruction if it may have side effects, is not in + // the loop, or is already in the predicated block. Remove the + // instruction from the worklist. + if (!I || I->mayHaveSideEffects() || !VectorLoop->contains(I) || + I->getParent() == PredBB) { + Worklist.remove(I); + continue; + } + + // It's legal to sink the instruction if all its uses occur in the + // predicated block. Otherwise, there's nothing to do. + if (!all_of(I->uses(), isBlockOfUsePredicated)) { + ++Idx; + continue; + } + + // Move the instruction to the beginning of the predicated block. We + // remove the instruction from the worklist, and then add its operands. + I->moveBefore(PredBB->getFirstNonPHI()); + Worklist.remove(I); + Worklist.insert(I->op_begin(), I->op_end()); + + // The sinking have have enabled other instructions to be sunk, so we + // will need to iterate. + Changed = true; + } + } while (Changed); +} + void InnerLoopVectorizer::predicateInstructions() { // For each instruction I marked for predication on value C, split I into its - // own basic block to form an if-then construct over C. - // Since I may be fed by extractelement and/or be feeding an insertelement - // generated during scalarization we try to move such instructions into the - // predicated basic block as well. For the insertelement this also means that - // the PHI will be created for the resulting vector rather than for the - // scalar instruction. + // own basic block to form an if-then construct over C. Since I may be fed by + // an extractelement instruction or other scalar operand, we try to + // iteratively sink its scalar operands into the predicated block. If I feeds + // an insertelement instruction, we try to move this instruction into the + // predicated block as well. For non-void types, a phi node will be created + // for the resulting value (either vector or scalar). + // // So for some predicated instruction, e.g. the conditional sdiv in: // // for.body: @@ -4330,13 +4395,7 @@ auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, /*BranchWeights=*/nullptr, DT, LI); I->moveBefore(T); - // Try to move any extractelement we may have created for the predicated - // instruction into the Then block. - for (Use &Op : I->operands()) { - auto *OpInst = dyn_cast(&*Op); - if (OpInst && OpInst->hasOneUse()) // TODO: more accurately - hasOneUser() - OpInst->moveBefore(&*I); - } + sinkScalarOperands(&*I); I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if"); BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue"); Index: test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll =================================================================== --- test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -200,15 +200,15 @@ ; INTER-NOT: LV: Found uniform instruction: %tmp0 = getelementptr inbounds %pair, %pair* %p, i64 %i, i32 0 ; INTER: vector.body ; INTER: %index = phi i64 [ 0, %vector.ph ], [ %index.next, {{.*}} ] -; INTER: %[[I1:.+]] = or i64 %index, 1 -; INTER: %[[I2:.+]] = or i64 %index, 2 -; INTER: %[[I3:.+]] = or i64 %index, 3 ; INTER: %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %index, i32 0 +; INTER: %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>* +; INTER: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8 +; INTER: %[[I1:.+]] = or i64 %index, 1 ; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I1]], i32 0 +; INTER: %[[I2:.+]] = or i64 %index, 2 ; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I2]], i32 0 +; INTER: %[[I3:.+]] = or i64 %index, 3 ; INTER: getelementptr inbounds %pair, %pair* %p, i64 %[[I3]], i32 0 -; INTER: %[[B0:.+]] = bitcast i32* %[[G0]] to <8 x i32>* -; INTER: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 8 ; INTER: br i1 {{.*}}, label %middle.block, label %vector.body ; define void @predicated_store(%pair *%p, i32 %x, i64 %n) { Index: test/Transforms/LoopVectorize/if-pred-non-void.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-non-void.ll +++ test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -23,7 +23,7 @@ ; CHECK: [[CSD]]: ; CHECK: %[[SDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]] +; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA1]], %[[SDA0]] ; CHECK: %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0 ; CHECK: br label %[[ESD]] ; CHECK: [[ESD]]: @@ -34,7 +34,7 @@ ; CHECK: [[CSDH]]: ; CHECK: %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 ; CHECK: %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 -; CHECK: %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]] +; CHECK: %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA1H]], %[[SDA0H]] ; CHECK: %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1 ; CHECK: br label %[[ESDH]] ; CHECK: [[ESDH]]: @@ -46,7 +46,7 @@ ; CHECK: [[CUD]]: ; CHECK: %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]] +; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA1]], %[[UDA0]] ; CHECK: %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0 ; CHECK: br label %[[EUD]] ; CHECK: [[EUD]]: @@ -58,7 +58,7 @@ ; CHECK: [[CSR]]: ; CHECK: %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]] +; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA1]], %[[SRA0]] ; CHECK: %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0 ; CHECK: br label %[[ESR]] ; CHECK: [[ESR]]: @@ -70,7 +70,7 @@ ; CHECK: [[CUR]]: ; CHECK: %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 ; CHECK: %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]] +; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA1]], %[[URA0]] ; CHECK: %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0 ; CHECK: br label %[[EUR]] ; CHECK: [[EUR]]: Index: test/Transforms/LoopVectorize/if-pred-stores.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-stores.ll +++ test/Transforms/LoopVectorize/if-pred-stores.ll @@ -11,9 +11,6 @@ ; VEC-LABEL: test ; VEC: %[[v0:.+]] = add i64 %index, 0 -; VEC: %[[v1:.+]] = add i64 %index, 1 -; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] -; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]] ; VEC: %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, ; VEC: %[[v9:.+]] = add nsw <2 x i32> %{{.*}}, ; VEC: %[[v10:.+]] = and <2 x i1> %[[v8]], @@ -23,6 +20,7 @@ ; VEC: br i1 %[[v12]], label %[[cond:.+]], label %[[else:.+]] ; ; VEC: [[cond]]: +; VEC: %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]] ; VEC: %[[v13:.+]] = extractelement <2 x i32> %[[v9]], i32 0 ; VEC: store i32 %[[v13]], i32* %[[v2]], align 4 ; VEC: br label %[[else:.+]] @@ -33,6 +31,8 @@ ; VEC: br i1 %[[v16]], label %[[cond2:.+]], label %[[else2:.+]] ; ; VEC: [[cond2]]: +; VEC: %[[v1:.+]] = add i64 %index, 1 +; VEC: %[[v4:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v1]] ; VEC: %[[v17:.+]] = extractelement <2 x i32> %[[v9]], i32 1 ; VEC: store i32 %[[v17]], i32* %[[v4]], align 4 ; VEC: br label %[[else2:.+]] @@ -49,14 +49,13 @@ ; UNROLL: %[[v3:[a-zA-Z0-9]+]] = load i32, i32* %[[v1]], align 4 ; UNROLL: %[[v4:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v2]], 100 ; UNROLL: %[[v5:[a-zA-Z0-9]+]] = icmp sgt i32 %[[v3]], 100 -; UNROLL: %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20 -; UNROLL: %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20 ; UNROLL: %[[o1:[a-zA-Z0-9]+]] = or i1 false, %[[v4]] ; UNROLL: %[[o2:[a-zA-Z0-9]+]] = or i1 false, %[[v5]] ; UNROLL: %[[v8:[a-zA-Z0-9]+]] = icmp eq i1 %[[o1]], true ; UNROLL: br i1 %[[v8]], label %[[cond:[a-zA-Z0-9.]+]], label %[[else:[a-zA-Z0-9.]+]] ; ; UNROLL: [[cond]]: +; UNROLL: %[[v6:[a-zA-Z0-9]+]] = add nsw i32 %[[v2]], 20 ; UNROLL: store i32 %[[v6]], i32* %[[v0]], align 4 ; UNROLL: br label %[[else]] ; @@ -65,6 +64,7 @@ ; UNROLL: br i1 %[[v9]], label %[[cond2:[a-zA-Z0-9.]+]], label %[[else2:[a-zA-Z0-9.]+]] ; ; UNROLL: [[cond2]]: +; UNROLL: %[[v7:[a-zA-Z0-9]+]] = add nsw i32 %[[v3]], 20 ; UNROLL: store i32 %[[v7]], i32* %[[v1]], align 4 ; UNROLL: br label %[[else2]] ; Index: test/Transforms/LoopVectorize/induction.ll =================================================================== --- test/Transforms/LoopVectorize/induction.ll +++ test/Transforms/LoopVectorize/induction.ll @@ -303,58 +303,58 @@ ; CHECK: vector.body: ; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ] ; CHECK: %[[I0:.+]] = add i32 %index, 0 -; CHECK: %[[I1:.+]] = add i32 %index, 1 ; CHECK: getelementptr inbounds i32, i32* %a, i32 %[[I0]] ; CHECK: pred.udiv.if: ; CHECK: udiv i32 {{.*}}, %[[I0]] ; CHECK: pred.udiv.if1: +; CHECK: %[[I1:.+]] = add i32 %index, 1 ; CHECK: udiv i32 {{.*}}, %[[I1]] ; ; UNROLL-NO_IC-LABEL: @scalarize_induction_variable_05( ; UNROLL-NO-IC: vector.body: ; UNROLL-NO-IC: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ] ; UNROLL-NO-IC: %[[I0:.+]] = add i32 %index, 0 -; UNROLL-NO-IC: %[[I1:.+]] = add i32 %index, 1 ; UNROLL-NO-IC: %[[I2:.+]] = add i32 %index, 2 -; UNROLL-NO-IC: %[[I3:.+]] = add i32 %index, 3 ; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I0]] ; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I2]] ; UNROLL-NO-IC: pred.udiv.if: ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I0]] ; UNROLL-NO-IC: pred.udiv.if6: +; UNROLL-NO-IC: %[[I1:.+]] = add i32 %index, 1 ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I1]] ; UNROLL-NO-IC: pred.udiv.if8: ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I2]] ; UNROLL-NO-IC: pred.udiv.if10: +; UNROLL-NO-IC: %[[I3:.+]] = add i32 %index, 3 ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I3]] ; ; IND-LABEL: @scalarize_induction_variable_05( ; IND: vector.body: ; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ] -; IND: %[[I1:.+]] = or i32 %index, 1 ; IND: %[[E0:.+]] = sext i32 %index to i64 ; IND: getelementptr inbounds i32, i32* %a, i64 %[[E0]] ; IND: pred.udiv.if: ; IND: udiv i32 {{.*}}, %index ; IND: pred.udiv.if1: +; IND: %[[I1:.+]] = or i32 %index, 1 ; IND: udiv i32 {{.*}}, %[[I1]] ; ; UNROLL-LABEL: @scalarize_induction_variable_05( ; UNROLL: vector.body: ; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ] -; UNROLL: %[[I1:.+]] = or i32 %index, 1 ; UNROLL: %[[I2:.+]] = or i32 %index, 2 -; UNROLL: %[[I3:.+]] = or i32 %index, 3 ; UNROLL: %[[E0:.+]] = sext i32 %index to i64 ; UNROLL: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]] ; UNROLL: getelementptr i32, i32* %[[G0]], i64 2 ; UNROLL: pred.udiv.if: ; UNROLL: udiv i32 {{.*}}, %index ; UNROLL: pred.udiv.if6: +; UNROLL: %[[I1:.+]] = or i32 %index, 1 ; UNROLL: udiv i32 {{.*}}, %[[I1]] ; UNROLL: pred.udiv.if8: ; UNROLL: udiv i32 {{.*}}, %[[I2]] ; UNROLL: pred.udiv.if10: +; UNROLL: %[[I3:.+]] = or i32 %index, 3 ; UNROLL: udiv i32 {{.*}}, %[[I3]] define i32 @scalarize_induction_variable_05(i32* %a, i32 %x, i1 %c, i32 %n) {