Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -313,6 +313,7 @@
 
   private:
     bool eliminateFallThrough(Function &F);
+    bool moveHoistedInstrsBackIntoLoop(Function &F);
     bool eliminateMostlyEmptyBlocks(Function &F);
     BasicBlock *findDestBlockOfMergeableEmptyBlock(BasicBlock *BB);
     bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
@@ -413,6 +414,8 @@
     }
   }
 
+  EverMadeChange |= moveHoistedInstrsBackIntoLoop(F);
+
   // Eliminate blocks that contain only PHI nodes and an
   // unconditional branch.
   EverMadeChange |= eliminateMostlyEmptyBlocks(F);
@@ -577,6 +580,121 @@
   return DestBB;
 }
 
+/// Return true if \p I can be re-executed on every loop iteration without
+/// changing program behaviour: it must not touch memory, have side effects,
+/// allocate stack space or take part in control flow / exception handling.
+static bool isSafeToMoveIntoLoop(const Instruction *I) {
+  return !I->mayReadFromMemory() && !I->mayHaveSideEffects() &&
+         !I->isTerminator() && !I->isEHPad() && !isa<PHINode>(I) &&
+         !isa<AllocaInst>(I);
+}
+
+/// Walk the operands of \p I looking for pieces of a pattern that were hoisted
+/// into \p Preheader. Operands defined in \p BB are visited recursively;
+/// operands defined in \p Preheader are appended to \p Worklist and \p I is
+/// recorded in \p Users (each container holds an instruction at most once).
+/// PHI nodes and non-instruction operands are ignored.
+static void addPatternMembers(Instruction *I, BasicBlock *Preheader,
+                              BasicBlock *BB,
+                              SmallVectorImpl<Instruction *> &Users,
+                              SmallVectorImpl<Instruction *> &Worklist) {
+  for (const Use &U : I->operands()) {
+    auto *Op = dyn_cast<Instruction>(U.get());
+    if (!Op || isa<PHINode>(Op))
+      continue;
+    if (Op->getParent() == BB) {
+      addPatternMembers(Op, Preheader, BB, Users, Worklist);
+    } else if (Op->getParent() == Preheader) {
+      if (std::find(Users.begin(), Users.end(), I) == Users.end())
+        Users.push_back(I);
+      if (std::find(Worklist.begin(), Worklist.end(), Op) == Worklist.end())
+        Worklist.push_back(Op);
+    }
+  }
+}
+
+/// The LICM pass sometimes hoists part of a pattern out of a loop while
+/// leaving the rest inside, which prevents the pieces from being combined
+/// during instruction selection. For every shift, or or(shl, lshr), found in
+/// a loop block this function moves the hoisted operands back from the loop
+/// preheader into that block, provided they are safe to re-execute and all
+/// of their users belong to the pattern.
+///
+/// \returns true if any instruction was moved.
+bool CodeGenPrepare::moveHoistedInstrsBackIntoLoop(Function &F) {
+  bool MadeChange = false;
+  SmallVector<Loop *, 8> LoopList(LI->begin(), LI->end());
+  while (!LoopList.empty()) {
+    Loop *L = LoopList.pop_back_val();
+    // Queue the sub-loops as well; each has a preheader of its own.
+    LoopList.append(L->begin(), L->end());
+
+    // Without a dedicated preheader there is nothing to move back.
+    BasicBlock *Preheader = L->getLoopPreheader();
+    if (!Preheader)
+      continue;
+
+    for (BasicBlock *BB : L->getBlocks()) {
+      // Nothing may be inserted in front of an EH pad instruction.
+      if (BB->isEHPad())
+        continue;
+
+      for (Instruction &I : *BB) {
+        // Only handle patterns rooted at a shift or at or(shl, lshr).
+        if (I.getOpcode() == Instruction::Or) {
+          auto *Op0 = dyn_cast<Instruction>(I.getOperand(0));
+          auto *Op1 = dyn_cast<Instruction>(I.getOperand(1));
+          if (!Op0 || !Op1 || Op0->getOpcode() == Op1->getOpcode() ||
+              !Op0->isLogicalShift() || !Op1->isLogicalShift())
+            continue;
+        } else if (!I.isShift()) {
+          continue;
+        }
+
+        // Users holds the in-loop instructions of the pattern that use values
+        // defined in the preheader. Worklist holds those preheader values,
+        // i.e. the candidates to be moved back into the loop.
+        SmallVector<Instruction *, 8> Users;
+        SmallVector<Instruction *, 8> Worklist;
+        addPatternMembers(&I, Preheader, BB, Users, Worklist);
+
+        // Move one candidate at a time and rescan from the start: a move
+        // extends Users and Worklist, which can unblock earlier candidates.
+        // Stop once a complete scan moves nothing.
+        bool Moved;
+        do {
+          Moved = false;
+          for (auto It = Worklist.begin(), EIt = Worklist.end(); It != EIt;
+               ++It) {
+            Instruction *ToFetch = *It;
+            if (!isSafeToMoveIntoLoop(ToFetch))
+              continue;
+
+            // Every user must already be part of the pattern inside BB;
+            // otherwise the value is still needed where it is.
+            bool AllUsersInPattern = true;
+            for (User *U : ToFetch->users())
+              if (std::find(Users.begin(), Users.end(), U) == Users.end()) {
+                AllUsersInPattern = false;
+                break;
+              }
+            if (!AllUsersInPattern)
+              continue;
+
+            ToFetch->moveBefore(BB->getFirstNonPHI());
+            // Erase before growing the lists; It and EIt are dead after this.
+            Worklist.erase(It);
+            addPatternMembers(ToFetch, Preheader, BB, Users, Worklist);
+            MadeChange = Moved = true;
+            break;
+          }
+        } while (Moved);
+      }
+    }
+  }
+  return MadeChange;
+}
+
 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
 /// edges in ways that are non-optimal for isel. Start by eliminating these
Index: test/Transforms/CodeGenPrepare/reverse-licm.ll
===================================================================
--- /dev/null
+++ test/Transforms/CodeGenPrepare/reverse-licm.ll
@@ -0,0 +1,523 @@
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+
+define void @rolv(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) {
+; CHECK-LABEL: @rolv
+; CHECK: for.body:
+; CHECK-NEXT: phi
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: and
+; CHECK-NEXT: sub
+; CHECK-NEXT: icmp
+; CHECK-NEXT: select
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: bitcast
+; CHECK-NEXT: load
+; CHECK-NEXT: shl
+; CHECK-NEXT: select
+; CHECK-NEXT: lshr
+; CHECK-NEXT: or
+; CHECK-NEXT: extractelement
+entry:
+  %cmp7 = icmp eq i32 %N, 0
+  br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+  %0 = bitcast <2 x i64> %b to <4 x i32>
+  %1 = and <4 x i32> %0, 
+  %2 = sub nsw <4 x i32> , %1
+  %3 = icmp ult <4 x i32> %2, 
+  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+  ret void
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds <2 x 
i64>, <2 x i64>* %a, i64 %indvars.iv + %5 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %6 = load <4 x i32>, <4 x i32>* %5, align 16 + %7 = shl <4 x i32> %6, %1 + %8 = select <4 x i1> %3, <4 x i32> %6, <4 x i32> zeroinitializer + %9 = lshr <4 x i32> %8, %4 + %10 = or <4 x i32> %9, %7 + %11 = extractelement <4 x i32> %10, i32 0 + %idxprom1 = sext i32 %11 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %12 = trunc i64 %indvars.iv to i32 + store i32 %12, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @rorv(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) { +; CHECK-LABEL: @rorv +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: bitcast +; CHECK-NEXT: and +; CHECK-NEXT: sub +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: select +; CHECK-NEXT: shl +; CHECK-NEXT: lshr +; CHECK-NEXT: or +; CHECK-NEXT: extractelement +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = bitcast <2 x i64> %b to <4 x i32> + %1 = and <4 x i32> %0, + %2 = sub nsw <4 x i32> , %1 + %3 = icmp ult <4 x i32> %2, + %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %5 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %6 = load <4 x i32>, <4 x i32>* %5, align 16 + %7 = select <4 x i1> %3, <4 x i32> %6, <4 x i32> zeroinitializer + %8 = shl <4 x i32> %7, %4 + %9 = lshr <4 x i32> 
%6, %1 + %10 = or <4 x i32> %8, %9 + %11 = extractelement <4 x i32> %10, i32 0 + %idxprom1 = sext i32 %11 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %12 = trunc i64 %indvars.iv to i32 + store i32 %12, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @sllv(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) { +entry: +; CHECK-LABEL: @sllv +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: bitcast +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: select +; CHECK-NEXT: shl + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = bitcast <2 x i64> %b to <4 x i32> + %1 = icmp ult <4 x i32> %0, + %2 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %3 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %4 = load <4 x i32>, <4 x i32>* %3, align 16 + %5 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> zeroinitializer + %6 = shl <4 x i32> %5, %2 + %7 = extractelement <4 x i32> %6, i32 0 + %idxprom1 = sext i32 %7 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %8 = trunc i64 %indvars.iv to i32 + store i32 %8, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @sll(i32* nocapture %x, 
i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) { +; CHECK-LABEL: @sll +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: extractelement +; CHECK-NEXT: trunc +; CHECK-NEXT: insertelement +; CHECK-NEXT: shufflevector +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: select +; CHECK-NEXT: shl +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = extractelement <2 x i64> %b, i64 0 + %1 = icmp ult i64 %0, 32 + %2 = trunc i64 %0 to i32 + %.splatinsert.i = insertelement <4 x i32> undef, i32 %2, i32 0 + %.splat.i = shufflevector <4 x i32> %.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %3 = select i1 %1, <4 x i32> %.splat.i, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %4 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %5 = load <4 x i32>, <4 x i32>* %4, align 16 + %6 = select i1 %1, <4 x i32> %5, <4 x i32> zeroinitializer + %7 = shl <4 x i32> %6, %3 + %8 = extractelement <4 x i32> %7, i32 0 + %idxprom1 = sext i32 %8 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %9 = trunc i64 %indvars.iv to i32 + store i32 %9, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @slli(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, i32 %b) { +; CHECK-LABEL: @slli +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: insertelement +; CHECK-NEXT: shufflevector +; CHECK-NEXT: icmp +; CHECK-NEXT: 
select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: select +; CHECK-NEXT: shl +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = icmp ult i32 %b, 32 + %.splatinsert.i = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat.i = shufflevector <4 x i32> %.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = select i1 %0, <4 x i32> %.splat.i, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %2 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %3 = load <4 x i32>, <4 x i32>* %2, align 16 + %4 = select i1 %0, <4 x i32> %3, <4 x i32> zeroinitializer + %5 = shl <4 x i32> %4, %1 + %6 = extractelement <4 x i32> %5, i32 0 + %idxprom1 = sext i32 %6 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %7 = trunc i64 %indvars.iv to i32 + store i32 %7, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @srlv(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) { +; CHECK-LABEL: @srlv +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: bitcast +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: select +; CHECK-NEXT: lshr +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = bitcast <2 x i64> %b to <4 x i32> + %1 = icmp ult <4 x i32> %0, + %2 = select <4 x i1> %1, <4 x 
i32> %0, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %3 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %4 = load <4 x i32>, <4 x i32>* %3, align 16 + %5 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> zeroinitializer + %6 = lshr <4 x i32> %5, %2 + %7 = extractelement <4 x i32> %6, i32 0 + %idxprom1 = sext i32 %7 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %8 = trunc i64 %indvars.iv to i32 + store i32 %8, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @srl(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) { +; CHECK-LABEL: @srl +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: extractelement +; CHECK-NEXT: trunc +; CHECK-NEXT: insertelement +; CHECK-NEXT: shufflevector +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: select +; CHECK-NEXT: lshr +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = extractelement <2 x i64> %b, i64 0 + %1 = icmp ult i64 %0, 32 + %2 = trunc i64 %0 to i32 + %.splatinsert.i = insertelement <4 x i32> undef, i32 %2, i32 0 + %.splat.i = shufflevector <4 x i32> %.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %3 = select i1 %1, <4 x i32> %.splat.i, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, 
%for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %4 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %5 = load <4 x i32>, <4 x i32>* %4, align 16 + %6 = select i1 %1, <4 x i32> %5, <4 x i32> zeroinitializer + %7 = lshr <4 x i32> %6, %3 + %8 = extractelement <4 x i32> %7, i32 0 + %idxprom1 = sext i32 %8 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %9 = trunc i64 %indvars.iv to i32 + store i32 %9, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Function Attrs: norecurse nounwind uwtable +define void @srli(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, i32 %b) { +; CHECK-LABEL: @srli +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: insertelement +; CHECK-NEXT: shufflevector +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: select +; CHECK-NEXT: lshr +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = icmp ult i32 %b, 32 + %.splatinsert.i = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat.i = shufflevector <4 x i32> %.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = select i1 %0, <4 x i32> %.splat.i, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %2 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %3 = load <4 x i32>, <4 x i32>* %2, align 16 + %4 = 
select i1 %0, <4 x i32> %3, <4 x i32> zeroinitializer + %5 = lshr <4 x i32> %4, %1 + %6 = extractelement <4 x i32> %5, i32 0 + %idxprom1 = sext i32 %6 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %7 = trunc i64 %indvars.iv to i32 + store i32 %7, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @srav(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) { +; CHECK-LABEL: @srav +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: bitcast +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: and +; CHECK-NEXT: select +; CHECK-NEXT: ashr +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = bitcast <2 x i64> %b to <4 x i32> + %1 = icmp ult <4 x i32> %0, + %2 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %3 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %4 = load <4 x i32>, <4 x i32>* %3, align 16 + %5 = and <4 x i32> %4, + %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %5 + %7 = ashr <4 x i32> %6, %2 + %8 = extractelement <4 x i32> %7, i32 0 + %idxprom1 = sext i32 %8 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %9 = trunc i64 %indvars.iv to i32 + store i32 %9, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label 
%for.cond.cleanup, label %for.body +} + +define void @sra(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, <2 x i64> %b) { +; CHECK-LABEL: @sra +; CHECK: for.body: +; CHECK-NEXT: phi +; CHECK-NEXT: extractelement +; CHECK-NEXT: trunc +; CHECK-NEXT: insertelement +; CHECK-NEXT: shufflevector +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: and +; CHECK-NEXT: select +; CHECK-NEXT: ashr +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = extractelement <2 x i64> %b, i64 0 + %1 = icmp ult i64 %0, 31 + %2 = trunc i64 %0 to i32 + %.splatinsert.i = insertelement <4 x i32> undef, i32 %2, i32 0 + %.splat.i = shufflevector <4 x i32> %.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %3 = select i1 %1, <4 x i32> %.splat.i, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %4 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %5 = load <4 x i32>, <4 x i32>* %4, align 16 + %6 = and <4 x i32> %5, + %7 = select i1 %1, <4 x i32> %5, <4 x i32> %6 + %8 = ashr <4 x i32> %7, %3 + %9 = extractelement <4 x i32> %8, i32 0 + %idxprom1 = sext i32 %9 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %10 = trunc i64 %indvars.iv to i32 + store i32 %10, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define void @srai(i32* nocapture %x, i32 %N, <2 x i64>* nocapture readonly %a, i32 %b) { +; CHECK-LABEL: @srai +; CHECK: for.body: 
+; CHECK-NEXT: phi +; CHECK-NEXT: insertelement +; CHECK-NEXT: shufflevector +; CHECK-NEXT: icmp +; CHECK-NEXT: select +; CHECK-NEXT: getelementptr +; CHECK-NEXT: bitcast +; CHECK-NEXT: load +; CHECK-NEXT: and +; CHECK-NEXT: select +; CHECK-NEXT: ashr +entry: + %cmp7 = icmp eq i32 %N, 0 + br i1 %cmp7, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %0 = icmp ult i32 %b, 31 + %.splatinsert.i = insertelement <4 x i32> undef, i32 %b, i32 0 + %.splat.i = shufflevector <4 x i32> %.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = select i1 %0, <4 x i32> %.splat.i, <4 x i32> zeroinitializer + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.lr.ph + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %a, i64 %indvars.iv + %2 = bitcast <2 x i64>* %arrayidx to <4 x i32>* + %3 = load <4 x i32>, <4 x i32>* %2, align 16 + %4 = and <4 x i32> %3, + %5 = select i1 %0, <4 x i32> %3, <4 x i32> %4 + %6 = ashr <4 x i32> %5, %1 + %7 = extractelement <4 x i32> %6, i32 0 + %idxprom1 = sext i32 %7 to i64 + %arrayidx2 = getelementptr inbounds i32, i32* %x, i64 %idxprom1 + %8 = trunc i64 %indvars.iv to i32 + store i32 %8, i32* %arrayidx2, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +}