Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h =================================================================== --- include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -138,7 +138,7 @@ bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R); bool vectorizeStoreChain(ArrayRef Chain, slpvectorizer::BoUpSLP &R, - unsigned VecRegSize); + unsigned VecRegSize, bool OnlyBitParallel); bool vectorizeStores(ArrayRef Stores, slpvectorizer::BoUpSLP &R); Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -336,7 +336,7 @@ } /// \returns analysis of the Instructions in \p VL described in -/// InstructionsState, the Opcode that we suppose the whole list +/// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, unsigned BaseIndex = 0) { @@ -498,6 +498,7 @@ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. void buildTree(ArrayRef Roots, + bool IsSwar, ArrayRef UserIgnoreLst = None); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for @@ -505,6 +506,7 @@ /// into account (anf updating it, if required) list of externally used /// values stored in \p ExternallyUsedValues. void buildTree(ArrayRef Roots, + bool IsSwar, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst = None); @@ -521,6 +523,7 @@ BS->clear(); } MinBWs.clear(); + IsSwar = false; } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -573,6 +576,9 @@ /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + /// \returns whether the VectorizableTree has external uses. + bool hasExternalUses() const { return !ExternalUses.empty(); } + OptimizationRemarkEmitter *getORE() { return ORE; } private: @@ -1208,6 +1214,11 @@ /// value must be signed-extended, rather than zero-extended, back to its /// original width. MapVector> MinBWs; + + /// Is this a SWAR vectorization ? If true, the result type is a scalar type + /// and not a vector type. The "lanes" of the vector are contiguous bit + /// intervals (e.g. i64 is split into bits [63-32] and [31-0]). + bool IsSwar = false; }; } // end namespace slpvectorizer @@ -1291,15 +1302,18 @@ } // end namespace llvm void BoUpSLP::buildTree(ArrayRef Roots, + bool IsSwar, ArrayRef UserIgnoreLst) { ExtraValueToDebugLocsMap ExternallyUsedValues; - buildTree(Roots, ExternallyUsedValues, UserIgnoreLst); + buildTree(Roots, IsSwar, ExternallyUsedValues, UserIgnoreLst); } void BoUpSLP::buildTree(ArrayRef Roots, + bool IsSwar, ExtraValueToDebugLocsMap &ExternallyUsedValues, ArrayRef UserIgnoreLst) { deleteTree(); + this->IsSwar = IsSwar; UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; @@ -1364,6 +1378,11 @@ } } +static bool isBitParallel(unsigned Op) { + // FIXME: Handle ICmp, And, Or, Xor, BitCast. + return Op == Instruction::Load || Op == Instruction::Store; +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -1501,6 +1520,13 @@ unsigned ShuffleOrOp = S.isAltShuffle() ? 
(unsigned) Instruction::ShuffleVector : S.Opcode; + + if (IsSwar && !isBitParallel(ShuffleOrOp)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to non bit-parallel SWAR.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; + } + switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -1627,6 +1653,11 @@ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. + if (IsSwar) { + LLVM_DEBUG(dbgs() << "SLP: shuffle in SWAR.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; + } auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, @@ -3010,7 +3041,9 @@ Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + VectorType *const VecTy = IsSwar ? nullptr : VectorType::get(ScalarTy, E->Scalars.size()); + IntegerType *const SwarTy = IsSwar ? IntegerType::get(F->getContext(), ScalarTy->getIntegerBitWidth() * E->Scalars.size()) : nullptr; + Type* const VecOrSwarTy = IsSwar ? static_cast(SwarTy) : static_cast(VecTy); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -3306,7 +3339,7 @@ unsigned AS = LI->getPointerAddressSpace(); Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo(AS)); + VecOrSwarTy->getPointerTo(AS)); // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the @@ -3323,12 +3356,14 @@ LI->setAlignment(Alignment); Value *V = propagateMetadata(LI, E->Scalars); if (IsReorder) { + assert(!IsSwar); OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { + assert(!IsSwar); // TODO: Merge this shuffle with the ReorderShuffleMask. V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); @@ -3350,7 +3385,7 @@ Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecOrSwarTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); // The pointer operand uses an in-tree scalar, so add the new BitCast to @@ -3365,6 +3400,7 @@ ST->setAlignment(Alignment); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { + assert(!IsSwar); V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); } @@ -3566,6 +3602,7 @@ // Extract all of the elements with the external uses. 
for (const auto &ExternalUse : ExternalUses) { + assert(!IsSwar && "not implemented: extract in SWAR"); Value *Scalar = ExternalUse.Scalar; llvm::User *User = ExternalUse.User; @@ -4633,7 +4670,7 @@ } bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, - unsigned VecRegSize) { + unsigned VecRegSize, const bool IsSwar) { const unsigned ChainLen = Chain.size(); LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); @@ -4658,9 +4695,13 @@ << "\n"); ArrayRef Operands = Chain.slice(i, VF); - R.buildTree(Operands); + R.buildTree(Operands, IsSwar); if (R.isTreeTinyAndNotFullyVectorizable()) continue; + if (IsSwar && R.hasExternalUses()) { + LLVM_DEBUG(dbgs() << "SLP: Ignoring SWAR tree with external uses\n"); + continue; + } R.computeMinimumValueSizes(); @@ -4755,13 +4796,26 @@ // register size is a power-of-2? for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize(); Size /= 2) { - if (vectorizeStoreChain(Operands, R, Size)) { + if (vectorizeStoreChain(Operands, R, Size, false)) { // Mark the vectorized stores so that we don't vectorize them again. VectorizedStores.insert(Operands.begin(), Operands.end()); Changed = true; break; } } + // Now try to vectorize using SWAR (https://en.wikipedia.org/wiki/SWAR). + // Only allow operations that are instrinsically bit-parallel. + // FIXME: Extend to logical bitwise operations (e.g. XOR/OR/AND). We will + // need to check flags. + // FIXME: Extend to heterogeneous sizes (< 2xi8, 1xi16, 1xi32>). This is + // easy for copies but requires careful handling of shuffles to avoid + // generating inefficient code. + if (!Changed && vectorizeStoreChain(Operands, R, TTI->getRegisterBitWidth(false), true)) { + // Mark the vectorized stores so that we don't vectorize them again. + VectorizedStores.insert(Operands.begin(), Operands.end()); + Changed = true; + break; + } } return Changed; @@ -4889,7 +4943,7 @@ << "\n"); ArrayRef Ops = VL.slice(I, OpsWidth); - R.buildTree(Ops); + R.buildTree(Ops, false); Optional> Order = R.bestOrder(); // TODO: check if we can allow reordering for more cases. if (AllowReorder && Order) { @@ -4900,7 +4954,7 @@ // there are exactly two operations. assert(Ops.size() == 2); Value *ReorderedOps[] = {Ops[1], Ops[0]}; - R.buildTree(ReorderedOps, None); + R.buildTree(ReorderedOps, false, None); } if (R.isTreeTinyAndNotFullyVectorizable()) continue; @@ -5638,7 +5692,7 @@ IgnoreList.append(V.begin(), V.end()); while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) { auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth); - V.buildTree(VL, ExternallyUsedValues, IgnoreList); + V.buildTree(VL, false, ExternallyUsedValues, IgnoreList); Optional> Order = V.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. 
if (Order && Order->size() == VL.size()) { @@ -5646,7 +5700,7 @@ SmallVector ReorderedOps(VL.size()); llvm::transform(*Order, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); - V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); + V.buildTree(ReorderedOps, false, ExternallyUsedValues, IgnoreList); } if (V.isTreeTinyAndNotFullyVectorizable()) break; Index: test/Transforms/SLPVectorizer/X86/swar.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/swar.ll @@ -0,0 +1,228 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux" + +; This tests vectorization of bit-parallel operations (e.g. COPY) using SWAR. +; +; four_i32 tests vectorization of 4xi32 copy. This is vectorized using a vector +; register. +; +; two_i32 tests vectorization of 2xi32 copy. Copying (load/store without +; modifications) is trivially bit-parallel and can be vectorized using SWAR. +; +; two_i32_swap tests vectorization of 2xi32 copy with swapping. +; +; two_i32_add negative-tests vectorization of 2xi32 ADD. This should NOT be +; vectorized as ADD is not bit-parallel. + + +; four_i32 +; +;struct S { +; int32_t a; +; int32_t b; +; int32_t c; +; int32_t d; +; int64_t e; +; int32_t f; +;}; +; +;S copy_2xi32(const S& s) { +; S result; +; result.a = s.a; +; result.b = s.b; +; result.c = s.c; +; result.d = s.d; +; return result; +;} + +%struct.S4x32 = type { i32, i32, i32, i32, i64, i32 } + +define void @four_i32(%struct.S4x32* noalias nocapture sret, %struct.S4x32* nocapture readonly dereferenceable(24)) { +; CHECK-LABEL: @four_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32:%.*]], %struct.S4x32* [[TMP1:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 1 +; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 1 +; CHECK-NEXT: [[C_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 2 +; CHECK-NEXT: [[C_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 2 +; CHECK-NEXT: [[D_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 8 +; CHECK-NEXT: [[D_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 8 +; CHECK-NEXT: ret void +; +entry: + %a_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 0 + %a = load i32, i32* %a_src_ptr, align 8 + %a_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 0 + store i32 %a, i32* %a_dst_ptr, align 8 + %b_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 1 + %b = load i32, i32* %b_src_ptr, align 8 + %b_dst_ptr = getelementptr inbounds %struct.S4x32, 
%struct.S4x32* %0, i64 0, i32 1
+ store i32 %b, i32* %b_dst_ptr, align 8
+ %c_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 2
+ %c = load i32, i32* %c_src_ptr, align 8
+ %c_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 2
+ store i32 %c, i32* %c_dst_ptr, align 8
+ %d_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 3
+ %d = load i32, i32* %d_src_ptr, align 8
+ %d_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 3
+ store i32 %d, i32* %d_dst_ptr, align 8
+ ret void
+}
+
+; two_i32
+;
+;struct S {
+; int32_t a;
+; int32_t b;
+; int64_t c;
+; int32_t d;
+;};
+;
+;S copy_2xi32(const S& s) {
+; S result;
+; result.a = s.a;
+; result.b = s.b;
+; return result;
+;}
+
+%struct.S2x32 = type { i32, i32, i64, i32 }
+
+define void @two_i32(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b, i32* %b_dst_ptr, align 8
+ ret void
+}
+
+define void @two_i32_swap(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_swap(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 1
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[REORDER_SHUFFLE]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr 
inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %b, i32* %b_dst_ptr, align 8
+ ret void
+}
+
+define void @two_i32_add(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_add(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A:%.*]] = load i32, i32* [[A_SRC_PTR]], align 8
+; CHECK-NEXT: [[A_PLUS_1:%.*]] = add nsw i32 [[A]], 1
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: store i32 [[A_PLUS_1]], i32* [[A_DST_PTR]], align 8
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[B:%.*]] = load i32, i32* [[B_SRC_PTR]], align 8
+; CHECK-NEXT: [[B_PLUS_1:%.*]] = add nsw i32 [[B]], 1
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: store i32 [[B_PLUS_1]], i32* [[B_DST_PTR]], align 8
+; CHECK-NEXT: ret void
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_plus_1 = add nsw i32 %a, 1
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a_plus_1, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_plus_1 = add nsw i32 %b, 1
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b_plus_1, i32* %b_dst_ptr, align 8
+ ret void
+}
+
+define i32 @two_i32_extract(%struct.S2x32* noalias nocapture, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_extract(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b, i32* 
%b_dst_ptr, align 8
+ ret i32 %b
+}
+
+define i32 @two_i32_insert(%struct.S2x32* noalias nocapture, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_insert(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT: ret i32 [[TMP5]]
+;
+entry:
+ %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+ %a = load i32, i32* %a_src_ptr, align 8
+ %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+ store i32 %a, i32* %a_dst_ptr, align 8
+ %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+ %b = load i32, i32* %b_src_ptr, align 8
+ %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+ store i32 %b, i32* %b_dst_ptr, align 8
+ ret i32 %b
+}
Index: test/Transforms/SLPVectorizer/X86/tiny-tree.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/tiny-tree.ll
+++ test/Transforms/SLPVectorizer/X86/tiny-tree.ll
@@ -172,13 +172,13 @@
; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1
; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3
-; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]]
; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]]
; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1
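
For reference, here is a minimal sketch of the scalar-register form that the IsSwar path is meant to produce for a 2 x i32 copy when the ordinary vector path is not taken: SwarTy is IntegerType::get(Ctx, NumScalars * ScalarBits), i.e. i64 here, so the whole store chain becomes a single i64 load and store through bitcast pointers instead of a <2 x i32> access. This is illustrative only and is not output generated by the tests above; the type %struct.pair and the function @swar_copy are hypothetical names.

; Illustrative SWAR form of a 2 x i32 copy (hypothetical, not part of the patch).
%struct.pair = type { i32, i32 }

define void @swar_copy(%struct.pair* noalias nocapture %dst, %struct.pair* nocapture readonly %src) {
entry:
  %src_a = getelementptr inbounds %struct.pair, %struct.pair* %src, i64 0, i32 0
  %dst_a = getelementptr inbounds %struct.pair, %struct.pair* %dst, i64 0, i32 0
  %src64 = bitcast i32* %src_a to i64*
  %dst64 = bitcast i32* %dst_a to i64*
  ; Both 32-bit lanes move in one 64-bit scalar access; on a little-endian
  ; target the first field occupies bits [31:0] and the second bits [63:32].
  %v = load i64, i64* %src64, align 4
  store i64 %v, i64* %dst64, align 4
  ret void
}

Because the lanes share one integer register, only bit-parallel operations (here, plain copies) are legal on this form; lane-crossing arithmetic such as the add in two_i32_add would let carries leak between lanes, which is why isBitParallel() currently accepts only Load and Store.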