Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -441,10 +441,15 @@ /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); - /// Predicate conditional instructions that require predication on their - /// respective conditions. + /// Predicate the instructions in the vectorized loop whose corresponding + /// instructions in the scalar loop have been marked scalar-with-predication + /// by the legality analysis. void predicateInstructions(); + /// Predicate the instructions in \p PredInsts by the condition \p Cmp. All + /// the given instructions will be placed in the same basic block. + void predicateInstructions(ArrayRef PredInsts, Value *Cmp); + /// Collect the instructions from the original loop that would be trivially /// dead in the vectorized loop if generated. void collectTriviallyDeadInstructions(); @@ -474,12 +479,9 @@ /// and update the analysis passes. void updateAnalysis(); - /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. If \p IfPredicateInstr is true we need to 'hide' each - /// scalarized instruction behind an if block predicated on the control - /// dependence of the instruction. - virtual void scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr = false); + /// Represent instruction \p Instr from the original loop as a sequence of + /// scalar instructions in the vectorized loop. + virtual void scalarizeInstruction(Instruction *Instr); /// Vectorize Load and Store instructions, virtual void vectorizeMemoryInstruction(Instruction *Instr); @@ -754,10 +756,13 @@ /// vectorized and scalarized. ValueMap VectorLoopValueMap; - /// Store instructions that should be predicated, as a pair - /// - SmallVector, 4> PredicatedInstructions; + /// Holds the predicates created for the edges between given source and + /// destination blocks. EdgeMaskCache MaskCache; + + /// Holds the predicates created for entry into the given basic blocks. + DenseMap BlockInCache; + /// Trip count of the original loop. Value *TripCount; /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) @@ -794,8 +799,7 @@ UnrollFactor, LVL, CM) {} private: - void scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr = false) override; + void scalarizeInstruction(Instruction *Instr) override; void vectorizeMemoryInstruction(Instruction *Instr) override; Value *getBroadcastInstrs(Value *V) override; Value *getStepVector(Value *Val, int StartIdx, Value *Step, @@ -2787,7 +2791,7 @@ // Scalarize the memory instruction if necessary. if (Legal->memoryInstructionMustBeScalarized(Instr, VF)) - return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); + return scalarizeInstruction(Instr); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. @@ -2957,12 +2961,9 @@ VectorLoopValueMap.initVector(Instr, Entry); } -void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); - DEBUG(dbgs() << "LV: Scalarizing" - << (IfPredicateInstr ? " and predicating:" : ":") << *Instr - << '\n'); + DEBUG(dbgs() << "LV: Scalarizing: " << *Instr << '\n'); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; @@ -2974,9 +2975,11 @@ // Initialize a new scalar map entry. ScalarParts Entry(UF); - VectorParts Cond; - if (IfPredicateInstr) - Cond = createBlockInMask(Instr->getParent()); + // If the instruction requires predication, emit the block-in mask for its + // parent block. The mask will be stored in BlockInCache and made available + // for reuse (e.g., when performing the actual predication). + if (Legal->isScalarWithPredication(Instr)) + createBlockInMask(Instr->getParent()); // Determine the number of scalars we need to generate for each unroll // iteration. If the instruction is uniform, we only need to generate the @@ -2988,15 +2991,6 @@ Entry[Part].resize(VF); // For each scalar that we create: for (unsigned Lane = 0; Lane < Lanes; ++Lane) { - - // Start if-block. - Value *Cmp = nullptr; - if (IfPredicateInstr) { - Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, - ConstantInt::get(Cmp->getType(), 1)); - } - Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); @@ -3019,10 +3013,6 @@ if (auto *II = dyn_cast(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } } VectorLoopValueMap.initScalar(Instr, Entry); @@ -4267,6 +4257,10 @@ // reanalyzed if we don't yet know if we can sink it or not. SmallVector InstsToReanalyze; + // The location where an instruction will be sunk. This location is updated + // whenever we sink a new instruction. + Instruction *InsertPoint = PredInst; + // Returns true if a given use occurs in the predicated block. Phi nodes use // their operands in their corresponding predecessor blocks. auto isBlockOfUsePredicated = [&](Use &U) -> bool { @@ -4308,9 +4302,10 @@ continue; } - // Move the instruction to the beginning of the predicated block, and add - // it's operands to the worklist. - I->moveBefore(&*PredBB->getFirstInsertionPt()); + // Move the instruction to the insert point, and add it's operands to the + // worklist. We update the insert point to be the newly sunk instruction. + I->moveBefore(InsertPoint); + InsertPoint = I; Worklist.insert(I->op_begin(), I->op_end()); // The sinking may have enabled other instructions to be sunk, so we will @@ -4320,15 +4315,19 @@ } while (Changed); } -void InnerLoopVectorizer::predicateInstructions() { - - // For each instruction I marked for predication on value C, split I into its - // own basic block to form an if-then construct over C. Since I may be fed by - // an extractelement instruction or other scalar operand, we try to - // iteratively sink its scalar operands into the predicated block. If I feeds - // an insertelement instruction, we try to move this instruction into the - // predicated block as well. For non-void types, a phi node will be created - // for the resulting value (either vector or scalar). +void InnerLoopVectorizer::predicateInstructions( + ArrayRef PredInsts, Value *Cmp) { + + // Predicate all instructions in PredInsts on value Cmp by placing the + // instructions into a single, newly created basic block, forming an if-then + // construction over Cmp. The instructions in PredInsts are assumed to be + // scalarized instructions from the vector loop corresponding to the same + // unroll part and vector lane. Since the instructions may be fed by + // extractelement instructions or other scalar operands, we try to + // iteratively sink these scalar operands into the predicated block. If an + // instruction feeds an insertelement instruction, we try to move this + // instruction into the predicated block as well. For non-void types, a phi + // node will be created for the resulting value (either vector or scalar). // // So for some predicated instruction, e.g. the conditional sdiv in: // @@ -4371,46 +4370,51 @@ // %5 = add nsw <2 x i32> %4, %wide.load // %8 = icmp sgt <2 x i32> %wide.load52, // %9 = extractelement <2 x i1> %8, i32 0 - // br i1 %9, label %pred.sdiv.if, label %pred.sdiv.continue + // br i1 %9, label %pred.if, label %pred.continue // - // pred.sdiv.if: + // pred.if: // %10 = extractelement <2 x i32> %wide.load, i32 0 // %11 = extractelement <2 x i32> %wide.load51, i32 0 // %12 = sdiv i32 %10, %11 // %13 = insertelement <2 x i32> undef, i32 %12, i32 0 - // br label %pred.sdiv.continue + // br label %pred.continue // - // pred.sdiv.continue: - // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.sdiv.if ] + // pred.continue: + // %14 = phi <2 x i32> [ undef, %vector.body ], [ %13, %pred.if ] // %15 = extractelement <2 x i1> %8, i32 1 - // br i1 %15, label %pred.sdiv.if54, label %pred.sdiv.continue55 + // br i1 %15, label %pred.if54, label %pred.continue55 // - // pred.sdiv.if54: + // pred.if54: // %16 = extractelement <2 x i32> %wide.load, i32 1 // %17 = extractelement <2 x i32> %wide.load51, i32 1 // %18 = sdiv i32 %16, %17 // %19 = insertelement <2 x i32> %14, i32 %18, i32 1 - // br label %pred.sdiv.continue55 + // br label %pred.continue55 // - // pred.sdiv.continue55: - // %20 = phi <2 x i32> [ %14, %pred.sdiv.continue ], [ %19, %pred.sdiv.if54 ] + // pred.continue55: + // %20 = phi <2 x i32> [ %14, %pred.continue ], [ %19, %pred.if54 ] // %predphi = select <2 x i1> %8, <2 x i32> %20, <2 x i32> %5 - for (auto KV : PredicatedInstructions) { - BasicBlock::iterator I(KV.first); - BasicBlock *Head = I->getParent(); - auto *BB = SplitBlock(Head, &*std::next(I), DT, LI); - auto *T = SplitBlockAndInsertIfThen(KV.second, &*I, /*Unreachable=*/false, - /*BranchWeights=*/nullptr, DT, LI); + BasicBlock::iterator Front(PredInsts.front()); + BasicBlock *Head = Front->getParent(); + BasicBlock *BB = SplitBlock(Head, &*std::next(Front), DT, LI); + TerminatorInst *T = SplitBlockAndInsertIfThen( + Cmp, &*Front, /*Unreachable=*/false, /*BranchWeights=*/nullptr, DT, LI); + T->getParent()->setName("pred.if"); + BB->setName("pred.continue"); + + // Holds instructions whose uses will need to be replaced by the phi nodes we + // create. We maintain a vector of these pairs so we can perform the + // replacements after all instructions have been predicated and sunk. + SmallVector, 4> Replacements; + + for (Instruction *I : PredInsts) { I->moveBefore(T); sinkScalarOperands(&*I); - I->getParent()->setName(Twine("pred.") + I->getOpcodeName() + ".if"); - BB->setName(Twine("pred.") + I->getOpcodeName() + ".continue"); - // If the instruction is non-void create a Phi node at reconvergence point. if (!I->getType()->isVoidTy()) { - Value *IncomingTrue = nullptr; + Instruction *IncomingTrue = nullptr; Value *IncomingFalse = nullptr; if (I->hasOneUse() && isa(*I->user_begin())) { @@ -4430,15 +4434,84 @@ assert(PostDom && "Then block has multiple successors"); PHINode *Phi = PHINode::Create(IncomingTrue->getType(), 2, "", &PostDom->front()); - IncomingTrue->replaceAllUsesWith(Phi); Phi->addIncoming(IncomingFalse, Head); Phi->addIncoming(IncomingTrue, I->getParent()); + Replacements.push_back(std::make_pair(IncomingTrue, Phi)); } } + // Replace all uses of the predicated instruction (or insertelement + // instruction) with the new phi node we created for it. We ignore uses in + // the same basic block and the use by the phi node itself. + for (std::pair &R : Replacements) + for (User *U : R.first->users()) { + if (auto *I = dyn_cast(U)) + if (I == R.second || R.first->getParent() == I->getParent()) + continue; + U->replaceUsesOfWith(R.first, R.second); + } + DEBUG(DT->verifyDomTree()); } +void InnerLoopVectorizer::predicateInstructions() { + for (BasicBlock *BB : OrigLoop->blocks()) { + if (!Legal->blockNeedsPredication(BB)) + continue; + + // Collect the instructions in the original loop whose corresponding + // instructions in the vector loop must be predicated. + SmallVector ScalarLoopPredInsts; + for (Instruction &I : *BB) + if (Legal->isScalarWithPredication(&I)) { + assert(!Legal->isUniformAfterVectorization(&I) && + "Uniform after vectorization instruction requires predication"); + DEBUG(dbgs() << "LV: Predicating: " << I << '\n'); + ScalarLoopPredInsts.push_back(&I); + } + if (ScalarLoopPredInsts.empty()) + continue; + + // Set the insert point to the first instruction that requires predication. + // Note that the instruction must have been scalarized when vectorizing the + // loop since it requires predication. + Builder.SetInsertPoint( + cast(getScalarValue(ScalarLoopPredInsts.front(), 0, 0))); + + // Create the block mask. We do this once for all the instructions in the + // block. + VectorParts Cond = createBlockInMask(BB); + + // We're going to create a single block corresponding to each of the VF x + // UF iterations of the original loop. + for (unsigned Part = 0; Part < UF; ++Part) + for (unsigned Lane = 0; Lane < VF; ++Lane) { + + // Collect the instructions in the vector loop for this lane and part + // corresponding to each instruction in ScalarLoopPredInsts. + SmallVector VectorLoopPredInsts; + for (Instruction *I : ScalarLoopPredInsts) + VectorLoopPredInsts.push_back( + cast(getScalarValue(I, Part, Lane))); + + // Set the insert point to the first instruction that requires + // predication for this lane and part. + Builder.SetInsertPoint(VectorLoopPredInsts.front()); + + // Get the block mask value corresponding to this lane and part. + Value *Cmp = Cond[Part]; + if (VF > 1) + Cmp = + Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane)); + Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, + ConstantInt::get(Cmp->getType(), 1)); + + // Predicate the instructions in VectorLoopPredInsts with Cmp. + predicateInstructions(VectorLoopPredInsts, Cmp); + } + } +} + InnerLoopVectorizer::VectorParts InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { assert(is_contained(predecessors(Dst), Src) && "Invalid edge"); @@ -4477,10 +4550,17 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + // If the block-in mask for this basic block is cached, return it. + auto BICEntryIt = BlockInCache.find(BB); + if (BICEntryIt != BlockInCache.end()) + return BICEntryIt->second; + // Loop incoming mask is all-one. if (OrigLoop->getHeader() == BB) { Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); - return getVectorValue(C); + VectorParts BlockMask = getVectorValue(C); + BlockInCache[BB] = BlockMask; + return BlockMask; } // This is the block mask. We OR all incoming edges, and with zero. @@ -4494,6 +4574,7 @@ BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]); } + BlockInCache[BB] = BlockMask; return BlockMask; } @@ -4673,7 +4754,7 @@ // Scalarize with predication if this instruction may divide by zero and // block execution is conditional, otherwise fallthrough. if (Legal->isScalarWithPredication(&I)) { - scalarizeInstruction(&I, true); + scalarizeInstruction(&I); continue; } case Instruction::Add: @@ -7014,8 +7095,7 @@ VecValuesToIgnore.insert(&I); } -void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr, - bool IfPredicateInstr) { +void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector Params; @@ -7028,25 +7108,17 @@ // Initialize a new scalar map entry. ScalarParts Entry(UF); - VectorParts Cond; - if (IfPredicateInstr) - Cond = createBlockInMask(Instr->getParent()); + // If the instruction requires predication, emit the block-in mask for its + // parent block. The mask will be stored in BlockInCache and made available + // for reuse (e.g., when performing the actual predication). + if (Legal->isScalarWithPredication(Instr)) + createBlockInMask(Instr->getParent()); // For each vector unroll 'part': for (unsigned Part = 0; Part < UF; ++Part) { Entry[Part].resize(1); // For each scalar that we create: - // Start an "if (pred) a[i] = ..." block. - Value *Cmp = nullptr; - if (IfPredicateInstr) { - if (Cond[Part]->getType()->isVectorTy()) - Cond[Part] = - Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0)); - Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part], - ConstantInt::get(Cond[Part]->getType(), 1)); - } - Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); @@ -7068,19 +7140,12 @@ if (auto *II = dyn_cast(Cloned)) if (II->getIntrinsicID() == Intrinsic::assume) AC->registerAssumption(II); - - // End if-block. - if (IfPredicateInstr) - PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp)); } VectorLoopValueMap.initScalar(Instr, Entry); } void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { - auto *SI = dyn_cast(Instr); - bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); - - return scalarizeInstruction(Instr, IfPredicateInstr); + return scalarizeInstruction(Instr); } Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; } Index: test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -19,7 +19,8 @@ ; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 ; ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Scalarizing: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Predicating: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(i32* %a, i32* %b, i1 %c, i64 %n) { entry: @@ -60,7 +61,8 @@ ; (store(4) + extractelement(6)) / 2 = 5 ; ; CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %tmp2, i32* %tmp0, align 4 -; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Scalarizing: store i32 %tmp2, i32* %tmp0, align 4 +; CHECK: Predicating: store i32 %tmp2, i32* %tmp0, align 4 ; define void @predicated_store(i32* %a, i1 %c, i32 %x, i64 %n) { entry: Index: test/Transforms/LoopVectorize/if-pred-non-void.ll =================================================================== --- test/Transforms/LoopVectorize/if-pred-non-void.ll +++ test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -16,65 +16,64 @@ ret void ; CHECK-LABEL: test -; CHECK: vector.body: -; CHECK: %[[SDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0 -; CHECK: %[[SDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEE]], true -; CHECK: br i1 %[[SDCC]], label %[[CSD:[a-zA-Z0-9.]+]], label %[[ESD:[a-zA-Z0-9.]+]] -; CHECK: [[CSD]]: -; CHECK: %[[SDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[SDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[SD0:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0]], %[[SDA1]] -; CHECK: %[[SD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SD0]], i32 0 -; CHECK: br label %[[ESD]] -; CHECK: [[ESD]]: -; CHECK: %[[SDR:[a-zA-Z0-9]+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[SD1]], %[[CSD]] ] -; CHECK: %[[SDEEH:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 1 -; CHECK: %[[SDCCH:[a-zA-Z0-9]+]] = icmp eq i1 %[[SDEEH]], true -; CHECK: br i1 %[[SDCCH]], label %[[CSDH:[a-zA-Z0-9.]+]], label %[[ESDH:[a-zA-Z0-9.]+]] -; CHECK: [[CSDH]]: -; CHECK: %[[SDA0H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 -; CHECK: %[[SDA1H:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 1 -; CHECK: %[[SD0H:[a-zA-Z0-9]+]] = sdiv i32 %[[SDA0H]], %[[SDA1H]] -; CHECK: %[[SD1H:[a-zA-Z0-9]+]] = insertelement <2 x i32> %[[SDR]], i32 %[[SD0H]], i32 1 -; CHECK: br label %[[ESDH]] -; CHECK: [[ESDH]]: -; CHECK: %{{.*}} = phi <2 x i32> [ %[[SDR]], %[[ESD]] ], [ %[[SD1H]], %[[CSDH]] ] - -; CHECK: %[[UDEE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0 -; CHECK: %[[UDCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UDEE]], true -; CHECK: br i1 %[[UDCC]], label %[[CUD:[a-zA-Z0-9.]+]], label %[[EUD:[a-zA-Z0-9.]+]] -; CHECK: [[CUD]]: -; CHECK: %[[UDA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[UDA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[UD0:[a-zA-Z0-9]+]] = udiv i32 %[[UDA0]], %[[UDA1]] -; CHECK: %[[UD1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UD0]], i32 0 -; CHECK: br label %[[EUD]] -; CHECK: [[EUD]]: -; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UD1]], %[[CUD]] ] - -; CHECK: %[[SREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0 -; CHECK: %[[SRCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[SREE]], true -; CHECK: br i1 %[[SRCC]], label %[[CSR:[a-zA-Z0-9.]+]], label %[[ESR:[a-zA-Z0-9.]+]] -; CHECK: [[CSR]]: -; CHECK: %[[SRA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[SRA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[SR0:[a-zA-Z0-9]+]] = srem i32 %[[SRA0]], %[[SRA1]] -; CHECK: %[[SR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[SR0]], i32 0 -; CHECK: br label %[[ESR]] -; CHECK: [[ESR]]: -; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[SR1]], %[[CSR]] ] - -; CHECK: %[[UREE:[a-zA-Z0-9]+]] = extractelement <2 x i1> %{{.*}}, i32 0 -; CHECK: %[[URCC:[a-zA-Z0-9]+]] = icmp eq i1 %[[UREE]], true -; CHECK: br i1 %[[URCC]], label %[[CUR:[a-zA-Z0-9.]+]], label %[[EUR:[a-zA-Z0-9.]+]] -; CHECK: [[CUR]]: -; CHECK: %[[URA0:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[URA1:[a-zA-Z0-9]+]] = extractelement <2 x i32> %{{.*}}, i32 0 -; CHECK: %[[UR0:[a-zA-Z0-9]+]] = urem i32 %[[URA0]], %[[URA1]] -; CHECK: %[[UR1:[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 %[[UR0]], i32 0 -; CHECK: br label %[[EUR]] -; CHECK: [[EUR]]: -; CHECK: %{{.*}} = phi <2 x i32> [ undef, %{{.*}} ], [ %[[UR1]], %[[CUR]] ] +; CHECK: vector.body: +; CHECK: %[[ADD0:.+]] = add nsw <2 x i32> %[[LOAD0:.+]], +; CHECK-NEXT: %[[ADD1:.+]] = add nsw <2 x i32> %[[LOAD1:.+]], +; CHECK-NEXT: %[[ADD2:.+]] = add nsw <2 x i32> %[[LOAD2:.+]], +; CHECK-NEXT: %[[ADD3:.+]] = add nsw <2 x i32> %[[LOAD3:.+]], +; CHECK: %[[COND0:.+]] = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK-NEXT: %[[CMP0:.+]] = icmp eq i1 %[[COND0]], true +; CHECK-NEXT: br i1 %[[CMP0]], label %[[IF0:.+]], label %[[CONT0:.+]] +; CHECK: [[IF0]]: +; CHECK-NEXT: %[[E0_0:.+]] = extractelement <2 x i32> %[[ADD0]], i32 0 +; CHECK-NEXT: %[[E0_1:.+]] = extractelement <2 x i32> %[[LOAD0]], i32 0 +; CHECK-NEXT: %[[SDIV0:.+]] = sdiv i32 %[[E0_0]], %[[E0_1]] +; CHECK-NEXT: %[[I0_0:.+]] = insertelement <2 x i32> undef, i32 %[[SDIV0]], i32 0 +; CHECK-NEXT: %[[E0_2:.+]] = extractelement <2 x i32> %[[ADD1]], i32 0 +; CHECK-NEXT: %[[E0_3:.+]] = extractelement <2 x i32> %[[LOAD1]], i32 0 +; CHECK-NEXT: %[[UDIV0:.+]] = udiv i32 %[[E0_2]], %[[E0_3]] +; CHECK-NEXT: %[[I0_1:.+]] = insertelement <2 x i32> undef, i32 %[[UDIV0]], i32 0 +; CHECK-NEXT: %[[E0_4:.+]] = extractelement <2 x i32> %[[ADD2]], i32 0 +; CHECK-NEXT: %[[E0_5:.+]] = extractelement <2 x i32> %[[LOAD2]], i32 0 +; CHECK-NEXT: %[[SREM0:.+]] = srem i32 %[[E0_4]], %[[E0_5]] +; CHECK-NEXT: %[[I0_2:.+]] = insertelement <2 x i32> undef, i32 %[[SREM0]], i32 0 +; CHECK-NEXT: %[[E0_6:.+]] = extractelement <2 x i32> %[[ADD3]], i32 0 +; CHECK-NEXT: %[[E0_7:.+]] = extractelement <2 x i32> %[[LOAD3]], i32 0 +; CHECK-NEXT: %[[UREM0:.+]] = urem i32 %[[E0_6]], %[[E0_7]] +; CHECK-NEXT: %[[I0_3:.+]] = insertelement <2 x i32> undef, i32 %[[UREM0]], i32 0 +; CHECK-NEXT: br label %[[CONT0]] +; CHECK: [[CONT0]]: +; CHECK-NEXT: %[[PHI3:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[I0_3]], %[[IF0]] ] +; CHECK-NEXT: %[[PHI2:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[I0_2]], %[[IF0]] ] +; CHECK-NEXT: %[[PHI1:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[I0_1]], %[[IF0]] ] +; CHECK-NEXT: %[[PHI0:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[I0_0]], %[[IF0]] ] +; CHECK-NEXT: %[[COND1:.+]] = extractelement <2 x i1> %{{.*}}, i32 1 +; CHECK-NEXT: %[[CMP1:.+]] = icmp eq i1 %[[COND1]], true +; CHECK-NEXT: br i1 %[[CMP1]], label %[[IF1:.+]], label %[[CONT1:.+]] +; CHECK: [[IF1]]: +; CHECK-NEXT: %[[E1_0:.+]] = extractelement <2 x i32> %[[ADD0]], i32 1 +; CHECK-NEXT: %[[E1_1:.+]] = extractelement <2 x i32> %[[LOAD0]], i32 1 +; CHECK-NEXT: %[[SDIV1:.+]] = sdiv i32 %[[E1_0]], %[[E1_1]] +; CHECK-NEXT: %[[I1_0:.+]] = insertelement <2 x i32> %[[PHI0]], i32 %[[SDIV1]], i32 1 +; CHECK-NEXT: %[[E1_2:.+]] = extractelement <2 x i32> %[[ADD1]], i32 1 +; CHECK-NEXT: %[[E1_3:.+]] = extractelement <2 x i32> %[[LOAD1]], i32 1 +; CHECK-NEXT: %[[UDIV1:.+]] = udiv i32 %[[E1_2]], %[[E1_3]] +; CHECK-NEXT: %[[I1_1:.+]] = insertelement <2 x i32> %[[PHI1]], i32 %[[UDIV1]], i32 1 +; CHECK-NEXT: %[[E1_4:.+]] = extractelement <2 x i32> %[[ADD2]], i32 1 +; CHECK-NEXT: %[[E1_5:.+]] = extractelement <2 x i32> %[[LOAD2]], i32 1 +; CHECK-NEXT: %[[SREM1:.+]] = srem i32 %[[E1_4]], %[[E1_5]] +; CHECK-NEXT: %[[I1_2:.+]] = insertelement <2 x i32> %[[PHI2]], i32 %[[SREM1]], i32 1 +; CHECK-NEXT: %[[E1_6:.+]] = extractelement <2 x i32> %[[ADD3]], i32 1 +; CHECK-NEXT: %[[E1_7:.+]] = extractelement <2 x i32> %[[LOAD3]], i32 1 +; CHECK-NEXT: %[[UREM1:.+]] = urem i32 %[[E1_6]], %[[E1_7]] +; CHECK-NEXT: %[[I1_3:.+]] = insertelement <2 x i32> %[[PHI3]], i32 %[[UREM1]], i32 1 +; CHECK-NEXT: br label %[[CONT1]] +; CHECK: [[CONT1]]: +; CHECK-NEXT: phi <2 x i32> [ %[[PHI3]], %[[CONT0]] ], [ %[[I1_3]], %[[IF1]] ] +; CHECK-NEXT: phi <2 x i32> [ %[[PHI2]], %[[CONT0]] ], [ %[[I1_2]], %[[IF1]] ] +; CHECK-NEXT: phi <2 x i32> [ %[[PHI1]], %[[CONT0]] ], [ %[[I1_1]], %[[IF1]] ] +; CHECK-NEXT: phi <2 x i32> [ %[[PHI0]], %[[CONT0]] ], [ %[[I1_0]], %[[IF1]] ] +; CHECK: br {{.*}} label %middle.block, label %vector.body for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] @@ -122,13 +121,39 @@ ret void ; CHECK-LABEL: test_scalar2scalar -; CHECK: vector.body: -; CHECK: br i1 %{{.*}}, label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]] -; CHECK: [[THEN]]: -; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}} -; CHECK: br label %[[FI]] -; CHECK: [[FI]]: -; CHECK: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ] +; CHECK: vector.body: +; CHECK: %[[LOAD0:.+]] = load <2 x i32>, <2 x i32>* {{.*}}, align 4 +; CHECK: %[[LOAD1:.+]] = load <2 x i32>, <2 x i32>* {{.*}}, align 4 +; CHECK: %[[ADD0:.+]] = add nsw <2 x i32> %[[LOAD0]], +; CHECK: %[[COND0:.+]] = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK-NEXT: %[[CMP0:.+]] = icmp eq i1 %[[COND0]], true +; CHECK-NEXT: br i1 %[[CMP0]], label %[[IF0:.+]], label %[[CONT0:.+]] +; CHECK: [[IF0]]: +; CHECK-NEXT: %[[E0_0:.+]] = extractelement <2 x i32> %[[ADD0]], i32 0 +; CHECK-NEXT: %[[E0_1:.+]] = extractelement <2 x i32> %[[LOAD0]], i32 0 +; CHECK-NEXT: %[[SDIV0_0:.+]] = sdiv i32 %[[E0_0]], %[[E0_1]] +; CHECK-NEXT: %[[E0_2:.+]] = extractelement <2 x i32> %[[LOAD1]], i32 0 +; CHECK-NEXT: %[[SDIV0_1:.+]] = sdiv i32 %[[E0_2]], %[[SDIV0_0]] +; CHECK-NEXT: %[[I0:.+]] = insertelement <2 x i32> undef, i32 %[[SDIV0_1]], i32 0 +; CHECK-NEXT: br label %[[CONT0]] +; CHECK: [[CONT0]]: +; CHECK-NEXT: %[[PHI:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[I0]], %[[IF0]] ] +; CHECK-NEXT: phi i32 [ undef, %vector.body ], [ %[[SDIV0_0]], %[[IF0]] ] +; CHECK-NEXT: %[[COND1:.+]] = extractelement <2 x i1> %{{.*}}, i32 1 +; CHECK-NEXT: %[[CMP1:.+]] = icmp eq i1 %[[COND1]], true +; CHECK-NEXT: br i1 %[[CMP1]], label %[[IF1:.+]], label %[[CONT1:.+]] +; CHECK: [[IF1]]: +; CHECK-NEXT: %[[E1_0:.+]] = extractelement <2 x i32> %[[ADD0]], i32 1 +; CHECK-NEXT: %[[E1_1:.+]] = extractelement <2 x i32> %[[LOAD0]], i32 1 +; CHECK-NEXT: %[[SDIV1_0:.+]] = sdiv i32 %[[E1_0]], %[[E1_1]] +; CHECK-NEXT: %[[E1_2:.+]] = extractelement <2 x i32> %[[LOAD1]], i32 1 +; CHECK-NEXT: %[[SDIV1_1:.+]] = sdiv i32 %[[E1_2]], %[[SDIV1_0]] +; CHECK-NEXT: %[[I1:.+]] = insertelement <2 x i32> %[[PHI]], i32 %[[SDIV1_1]], i32 1 +; CHECK-NEXT: br label %[[CONT1]] +; CHECK: [[CONT1]]: +; CHECK-NEXT: phi <2 x i32> [ %[[PHI]], %[[CONT0]] ], [ %[[I1]], %[[IF1]] ] +; CHECK-NEXT: phi i32 [ undef, %[[CONT0]] ], [ %[[SDIV1_0]], %[[IF1]] ] +; CHECK: br {{.*}} label %middle.block, label %vector.body for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] @@ -161,25 +186,39 @@ ret void ; CHECK-LABEL: pr30172 -; CHECK: vector.body: -; CHECK: %[[CMP1:.+]] = icmp slt <2 x i32> %[[VAL:.+]], -; CHECK: %[[CMP2:.+]] = icmp sge <2 x i32> %[[VAL]], -; CHECK: %[[XOR:.+]] = xor <2 x i1> %[[CMP1]], -; CHECK: %[[AND1:.+]] = and <2 x i1> %[[XOR]], -; CHECK: %[[OR1:.+]] = or <2 x i1> zeroinitializer, %[[AND1]] -; CHECK: %[[AND2:.+]] = and <2 x i1> %[[CMP2]], %[[OR1]] -; CHECK: %[[OR2:.+]] = or <2 x i1> zeroinitializer, %[[AND2]] -; CHECK: %[[AND3:.+]] = and <2 x i1> %[[CMP1]], -; CHECK: %[[OR3:.+]] = or <2 x i1> %[[OR2]], %[[AND3]] -; CHECK: %[[EXTRACT:.+]] = extractelement <2 x i1> %[[OR3]], i32 0 -; CHECK: %[[MASK:.+]] = icmp eq i1 %[[EXTRACT]], true -; CHECK: br i1 %[[MASK]], label %[[THEN:[a-zA-Z0-9.]+]], label %[[FI:[a-zA-Z0-9.]+]] -; CHECK: [[THEN]]: -; CHECK: %[[PD:[a-zA-Z0-9]+]] = sdiv i32 %{{.*}}, %{{.*}} -; CHECK: br label %[[FI]] -; CHECK: [[FI]]: -; CHECK: %{{.*}} = phi i32 [ undef, %vector.body ], [ %[[PD]], %[[THEN]] ] - +; CHECK: vector.body: +; CHECK: %[[LOAD0:.+]] = load <2 x i32>, <2 x i32>* {{.*}}, align 4 +; CHECK: %[[LOAD1:.+]] = load <2 x i32>, <2 x i32>* {{.*}}, align 4 +; CHECK: %[[ADD0:.+]] = add nsw <2 x i32> %[[LOAD0]], +; CHECK: %[[COND0:.+]] = extractelement <2 x i1> %{{.*}}, i32 0 +; CHECK-NEXT: %[[CMP0:.+]] = icmp eq i1 %[[COND0]], true +; CHECK-NEXT: br i1 %[[CMP0]], label %[[IF0:.+]], label %[[CONT0:.+]] +; CHECK: [[IF0]]: +; CHECK-NEXT: %[[E0_0:.+]] = extractelement <2 x i32> %[[ADD0]], i32 0 +; CHECK-NEXT: %[[E0_1:.+]] = extractelement <2 x i32> %[[LOAD0]], i32 0 +; CHECK-NEXT: %[[SDIV0_0:.+]] = sdiv i32 %[[E0_0]], %[[E0_1]] +; CHECK-NEXT: %[[E0_2:.+]] = extractelement <2 x i32> %[[LOAD1]], i32 0 +; CHECK-NEXT: %[[SDIV0_1:.+]] = sdiv i32 %[[E0_2]], %[[SDIV0_0]] +; CHECK-NEXT: %[[I0:.+]] = insertelement <2 x i32> undef, i32 %[[SDIV0_1]], i32 0 +; CHECK-NEXT: br label %[[CONT0]] +; CHECK: [[CONT0]]: +; CHECK-NEXT: %[[PHI:.+]] = phi <2 x i32> [ undef, %vector.body ], [ %[[I0]], %[[IF0]] ] +; CHECK-NEXT: phi i32 [ undef, %vector.body ], [ %[[SDIV0_0]], %[[IF0]] ] +; CHECK-NEXT: %[[COND1:.+]] = extractelement <2 x i1> %{{.*}}, i32 1 +; CHECK-NEXT: %[[CMP1:.+]] = icmp eq i1 %[[COND1]], true +; CHECK-NEXT: br i1 %[[CMP1]], label %[[IF1:.+]], label %[[CONT1:.+]] +; CHECK: [[IF1]]: +; CHECK-NEXT: %[[E1_0:.+]] = extractelement <2 x i32> %[[ADD0]], i32 1 +; CHECK-NEXT: %[[E1_1:.+]] = extractelement <2 x i32> %[[LOAD0]], i32 1 +; CHECK-NEXT: %[[SDIV1_0:.+]] = sdiv i32 %[[E1_0]], %[[E1_1]] +; CHECK-NEXT: %[[E1_2:.+]] = extractelement <2 x i32> %[[LOAD1]], i32 1 +; CHECK-NEXT: %[[SDIV1_1:.+]] = sdiv i32 %[[E1_2]], %[[SDIV1_0]] +; CHECK-NEXT: %[[I1:.+]] = insertelement <2 x i32> %[[PHI]], i32 %[[SDIV1_1]], i32 1 +; CHECK-NEXT: br label %[[CONT1]] +; CHECK: [[CONT1]]: +; CHECK-NEXT: phi <2 x i32> [ %[[PHI]], %[[CONT0]] ], [ %[[I1]], %[[IF1]] ] +; CHECK-NEXT: phi i32 [ undef, %[[CONT0]] ], [ %[[SDIV1_0]], %[[IF1]] ] +; CHECK: br {{.*}} label %middle.block, label %vector.body for.body: ; preds = %if.end, %entry %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ] Index: test/Transforms/LoopVectorize/induction.ll =================================================================== --- test/Transforms/LoopVectorize/induction.ll +++ test/Transforms/LoopVectorize/induction.ll @@ -301,59 +301,59 @@ ; ; CHECK-LABEL: @scalarize_induction_variable_05( ; CHECK: vector.body: -; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ] +; CHECK: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.continue2 ] ; CHECK: %[[I0:.+]] = add i32 %index, 0 ; CHECK: getelementptr inbounds i32, i32* %a, i32 %[[I0]] -; CHECK: pred.udiv.if: +; CHECK: pred.if: ; CHECK: udiv i32 {{.*}}, %[[I0]] -; CHECK: pred.udiv.if1: +; CHECK: pred.if1: ; CHECK: %[[I1:.+]] = add i32 %index, 1 ; CHECK: udiv i32 {{.*}}, %[[I1]] ; ; UNROLL-NO_IC-LABEL: @scalarize_induction_variable_05( ; UNROLL-NO-IC: vector.body: -; UNROLL-NO-IC: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ] +; UNROLL-NO-IC: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.continue11 ] ; UNROLL-NO-IC: %[[I0:.+]] = add i32 %index, 0 ; UNROLL-NO-IC: %[[I2:.+]] = add i32 %index, 2 ; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I0]] ; UNROLL-NO-IC: getelementptr inbounds i32, i32* %a, i32 %[[I2]] -; UNROLL-NO-IC: pred.udiv.if: +; UNROLL-NO-IC: pred.if: ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I0]] -; UNROLL-NO-IC: pred.udiv.if6: +; UNROLL-NO-IC: pred.if6: ; UNROLL-NO-IC: %[[I1:.+]] = add i32 %index, 1 ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I1]] -; UNROLL-NO-IC: pred.udiv.if8: +; UNROLL-NO-IC: pred.if8: ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I2]] -; UNROLL-NO-IC: pred.udiv.if10: +; UNROLL-NO-IC: pred.if10: ; UNROLL-NO-IC: %[[I3:.+]] = add i32 %index, 3 ; UNROLL-NO-IC: udiv i32 {{.*}}, %[[I3]] ; ; IND-LABEL: @scalarize_induction_variable_05( ; IND: vector.body: -; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue2 ] +; IND: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.continue2 ] ; IND: %[[E0:.+]] = sext i32 %index to i64 ; IND: getelementptr inbounds i32, i32* %a, i64 %[[E0]] -; IND: pred.udiv.if: +; IND: pred.if: ; IND: udiv i32 {{.*}}, %index -; IND: pred.udiv.if1: +; IND: pred.if1: ; IND: %[[I1:.+]] = or i32 %index, 1 ; IND: udiv i32 {{.*}}, %[[I1]] ; ; UNROLL-LABEL: @scalarize_induction_variable_05( ; UNROLL: vector.body: -; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.udiv.continue11 ] +; UNROLL: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %pred.continue11 ] ; UNROLL: %[[I2:.+]] = or i32 %index, 2 ; UNROLL: %[[E0:.+]] = sext i32 %index to i64 ; UNROLL: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]] ; UNROLL: getelementptr i32, i32* %[[G0]], i64 2 -; UNROLL: pred.udiv.if: +; UNROLL: pred.if: ; UNROLL: udiv i32 {{.*}}, %index -; UNROLL: pred.udiv.if6: +; UNROLL: pred.if6: ; UNROLL: %[[I1:.+]] = or i32 %index, 1 ; UNROLL: udiv i32 {{.*}}, %[[I1]] -; UNROLL: pred.udiv.if8: +; UNROLL: pred.if8: ; UNROLL: udiv i32 {{.*}}, %[[I2]] -; UNROLL: pred.udiv.if10: +; UNROLL: pred.if10: ; UNROLL: %[[I3:.+]] = or i32 %index, 3 ; UNROLL: udiv i32 {{.*}}, %[[I3]] Index: test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll =================================================================== --- test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll +++ test/Transforms/LoopVectorize/interleaved-accesses-pred-stores.ll @@ -19,11 +19,11 @@ ; CHECK: %wide.vec = load <4 x i64>, <4 x i64>* %{{.*}} ; CHECK: %strided.vec = shufflevector <4 x i64> %wide.vec, <4 x i64> undef, <2 x i32> ; -; CHECK: pred.store.if +; CHECK: pred.if ; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0 ; CHECK: store i64 %[[X1]], {{.*}} ; -; CHECK: pred.store.if +; CHECK: pred.if ; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2 ; CHECK: store i64 %[[X2]], {{.*}} @@ -68,15 +68,15 @@ ; CHECK: %[[L1:.+]] = load <4 x i64>, <4 x i64>* %{{.*}} ; CHECK: %strided.vec = shufflevector <4 x i64> %[[L1]], <4 x i64> undef, <2 x i32> ; -; CHECK: pred.store.if +; CHECK: pred.if ; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0 ; CHECK: store i64 %[[X1]], {{.*}} ; -; CHECK: pred.store.if +; CHECK: pred.if ; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2 ; CHECK: store i64 %[[X2]], {{.*}} ; -; CHECK: pred.store.continue +; CHECK: pred.continue ; CHECK: %[[L2:.+]] = load <4 x i64>, <4 x i64>* {{.*}} ; CHECK: %[[X3:.+]] = extractelement <4 x i64> %[[L2]], i32 0 ; CHECK: store i64 %[[X3]], {{.*}} @@ -129,11 +129,11 @@ ; CHECK: store i64 %x, {{.*}} ; CHECK: store i64 %x, {{.*}} ; -; CHECK: pred.store.if +; CHECK: pred.if ; CHECK: %[[X1:.+]] = extractelement <4 x i64> %wide.vec, i32 0 ; CHECK: store i64 %[[X1]], {{.*}} ; -; CHECK: pred.store.if +; CHECK: pred.if ; CHECK: %[[X2:.+]] = extractelement <4 x i64> %wide.vec, i32 2 ; CHECK: store i64 %[[X2]], {{.*}}