Index: include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -138,7 +138,7 @@
   bool vectorizeChainsInBlock(BasicBlock *BB, slpvectorizer::BoUpSLP &R);

   bool vectorizeStoreChain(ArrayRef<Value *> Chain, slpvectorizer::BoUpSLP &R,
-                           unsigned VecRegSize);
+                           unsigned VecRegSize, bool OnlyBitParallel);

   bool vectorizeStores(ArrayRef<StoreInst *> Stores, slpvectorizer::BoUpSLP &R);

Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -336,7 +336,7 @@
 }

 /// \returns analysis of the Instructions in \p VL described in
-/// InstructionsState, the Opcode that we suppose the whole list 
+/// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                        unsigned BaseIndex = 0) {
@@ -498,6 +498,7 @@
   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
   void buildTree(ArrayRef<Value *> Roots,
+                 bool OnlyBitParallel,
                  ArrayRef<Value *> UserIgnoreLst = None);

   /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
@@ -505,6 +506,7 @@
   /// into account (anf updating it, if required) list of externally used
   /// values stored in \p ExternallyUsedValues.
   void buildTree(ArrayRef<Value *> Roots,
+                 bool OnlyBitParallel,
                  ExtraValueToDebugLocsMap &ExternallyUsedValues,
                  ArrayRef<Value *> UserIgnoreLst = None);

@@ -585,7 +587,7 @@
   int getEntryCost(TreeEntry *E);

   /// This is the recursive part of buildTree.
-  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
+  void buildTree_rec(ArrayRef<Value *> Roots, bool OnlyBitParallel, unsigned Depth, int);

   /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
   /// be vectorized to use the original vector (or aggregate "bitcast" to a
@@ -1291,19 +1293,21 @@
 } // end namespace llvm

 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+                        bool OnlyBitParallel,
                         ArrayRef<Value *> UserIgnoreLst) {
   ExtraValueToDebugLocsMap ExternallyUsedValues;
-  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+  buildTree(Roots, OnlyBitParallel, ExternallyUsedValues, UserIgnoreLst);
 }

 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+                        bool OnlyBitParallel,
                         ExtraValueToDebugLocsMap &ExternallyUsedValues,
                         ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
   UserIgnoreList = UserIgnoreLst;
   if (!allSameType(Roots))
     return;
-  buildTree_rec(Roots, 0, -1);
+  buildTree_rec(Roots, OnlyBitParallel, 0, -1);

   // Collect the values that we need to extract from the tree.
   for (TreeEntry &EIdx : VectorizableTree) {
@@ -1364,7 +1368,12 @@
   }
 }

-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+static bool isBitParallel(unsigned Op) {
+  // FIXME: Handle ICmp, And, Or, Xor.
+  return Op == Instruction::Load || Op == Instruction::BitCast || Op == Instruction::Store;
+}
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, bool OnlyBitParallel, unsigned Depth,
                             int UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

@@ -1501,6 +1510,13 @@
   unsigned ShuffleOrOp = S.isAltShuffle() ?
                 (unsigned) Instruction::ShuffleVector : S.Opcode;
+
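+  // Only bit-parallel operations may be bundled when vectorizing for SWAR:
+  // the packed lanes share one general-purpose register, so every allowed
+  // instruction must treat each bit of its operands independently (currently
+  // plain copies: loads, bitcasts and stores; see isBitParallel() above).
+  // Anything else is gathered and the bundle stays scalar.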
+  if (OnlyBitParallel && !isBitParallel(ShuffleOrOp)) {
+    LLVM_DEBUG(dbgs() << "SLP: Gathering due to non-bit-parallel opcode.\n");
+    newTreeEntry(VL, false, UserTreeIdx);
+    return;
+  }
+
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -1530,7 +1546,7 @@
           Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
               PH->getIncomingBlock(i)));

-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1674,7 +1690,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));

-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1704,7 +1720,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));

-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1735,8 +1751,8 @@
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
-        buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx);
+        buildTree_rec(Left, OnlyBitParallel, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, OnlyBitParallel, Depth + 1, UserTreeIdx);
         return;
       }

@@ -1746,7 +1762,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));

-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;

@@ -1795,7 +1811,7 @@
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));

-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1816,7 +1832,7 @@
       for (Value *j : VL)
         Operands.push_back(cast<Instruction>(j)->getOperand(0));

-      buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+      buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       return;
     }
     case Instruction::Call: {
@@ -1879,7 +1895,7 @@
           CallInst *CI2 = dyn_cast<CallInst>(j);
           Operands.push_back(CI2->getArgOperand(i));
         }
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1899,8 +1915,8 @@
       if (isa<BinaryOperator>(VL0)) {
         ValueList Left, Right;
         reorderAltShuffleOperands(S, VL, Left, Right);
-        buildTree_rec(Left, Depth + 1, UserTreeIdx);
-        buildTree_rec(Right, Depth + 1, UserTreeIdx);
+        buildTree_rec(Left, OnlyBitParallel, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, OnlyBitParallel, Depth + 1, UserTreeIdx);
         return;
       }

@@ -1910,7 +1926,7 @@
       for (Value *j : VL)
         Operands.push_back(cast<Instruction>(j)->getOperand(i));

-      buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+      buildTree_rec(Operands, OnlyBitParallel, Depth + 1, UserTreeIdx);
       }
       return;

@@ -4639,7 +4655,7 @@
 }

 bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
-                                            unsigned VecRegSize) {
+                                            unsigned VecRegSize, const bool OnlyBitParallel) {
   const unsigned ChainLen = Chain.size();
   LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                     << "\n");
@@ -4664,7 +4680,7 @@
                       << "\n");
     ArrayRef<Value *> Operands = Chain.slice(i, VF);

-    R.buildTree(Operands);
+    R.buildTree(Operands, OnlyBitParallel);
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
@@ -4761,13 +4777,26 @@
     // register size is a power-of-2?
     for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
          Size /= 2) {
-      if (vectorizeStoreChain(Operands, R, Size)) {
+      if (vectorizeStoreChain(Operands, R, Size, false)) {
         // Mark the vectorized stores so that we don't vectorize them again.
         VectorizedStores.insert(Operands.begin(), Operands.end());
         Changed = true;
         break;
       }
     }
+    // Now try to vectorize using SWAR (https://en.wikipedia.org/wiki/SWAR).
+    // Only allow operations that are intrinsically bit-parallel.
+    // FIXME: Extend to logical bitwise operations (e.g. XOR/OR/AND). We will
+    // need to check flags.
+    // FIXME: Extend to heterogeneous sizes (<2 x i8, 1 x i16, 1 x i32>). This
+    // is easy for copies but requires careful handling of shuffles to avoid
+    // generating inefficient code.
+    if (!Changed && vectorizeStoreChain(Operands, R, TTI->getRegisterBitWidth(false), true)) {
+      // Mark the vectorized stores so that we don't vectorize them again.
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+      Changed = true;
+      break;
+    }
   }

   return Changed;
 }
@@ -4895,7 +4924,7 @@
                       << "\n");
     ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

-    R.buildTree(Ops);
+    R.buildTree(Ops, false);
     Optional<ArrayRef<unsigned>> Order = R.bestOrder();
     // TODO: check if we can allow reordering for more cases.
     if (AllowReorder && Order) {
@@ -4906,7 +4935,7 @@
       // there are exactly two operations.
       assert(Ops.size() == 2);
       Value *ReorderedOps[] = {Ops[1], Ops[0]};
-      R.buildTree(ReorderedOps, None);
+      R.buildTree(ReorderedOps, false, None);
     }
     if (R.isTreeTinyAndNotFullyVectorizable())
       continue;
@@ -5644,7 +5673,7 @@
     IgnoreList.append(V.begin(), V.end());
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, ExternallyUsedValues, IgnoreList);
+      V.buildTree(VL, false, ExternallyUsedValues, IgnoreList);
       Optional<ArrayRef<unsigned>> Order = V.bestOrder();
       // TODO: Handle orders of size less than number of elements in the vector.
       if (Order && Order->size() == VL.size()) {
@@ -5652,7 +5681,7 @@
         SmallVector<Value *, 4> ReorderedOps(VL.size());
         llvm::transform(*Order, ReorderedOps.begin(),
                         [VL](const unsigned Idx) { return VL[Idx]; });
-        V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
+        V.buildTree(ReorderedOps, false, ExternallyUsedValues, IgnoreList);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
Index: test/Transforms/SLPVectorizer/X86/swar.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/swar.ll
@@ -0,0 +1,176 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-unknown-linux -mcpu=corei7 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+; This tests vectorization of bit-parallel operations (e.g. COPY) using SWAR.
+;
+; four_i32 tests vectorization of a 4xi32 copy. This is vectorized using a
+; vector register.
+;
+; two_i32 tests vectorization of a 2xi32 copy. Copying (load/store without
+; modifications) is trivially bit-parallel and can be vectorized using SWAR.
+;
+; two_i32_swap tests vectorization of a 2xi32 copy with swapping.
+;
+; two_i32_add negative-tests vectorization of a 2xi32 ADD. This should NOT be
+; vectorized as ADD is not bit-parallel.
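+;
+; Illustrative note (added for exposition, not produced by
+; update_test_checks.py): a copy is bit-parallel because packing the two i32
+; lanes into one 64-bit GPR and moving that GPR transfers every bit
+; independently. ADD is not: adding 1 to each lane of the packed value
+; 0x00000000ffffffff with a single 64-bit add (i.e. adding 0x0000000100000001)
+; yields 0x0000000200000000 instead of the correct per-lane result
+; 0x0000000100000000, because the carry out of bit 31 leaks into the high
+; lane. That is why two_i32_add must stay scalar.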
+ + +; four_i32 +; +;struct S { +; int32_t a; +; int32_t b; +; int32_t c; +; int32_t d; +; int64_t e; +; int32_t f; +;}; +; +;S copy_2xi32(const S& s) { +; S result; +; result.a = s.a; +; result.b = s.b; +; result.c = s.c; +; result.d = s.d; +; return result; +;} + +%struct.S4x32 = type { i32, i32, i32, i32, i64, i32 } + +define void @four_i32(%struct.S4x32* noalias nocapture sret, %struct.S4x32* nocapture readonly dereferenceable(24)) { +; CHECK-LABEL: @four_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32:%.*]], %struct.S4x32* [[TMP1:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 1 +; CHECK-NEXT: [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 1 +; CHECK-NEXT: [[C_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 2 +; CHECK-NEXT: [[C_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 2 +; CHECK-NEXT: [[D_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP1]], i64 0, i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 8 +; CHECK-NEXT: [[D_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S4X32]], %struct.S4x32* [[TMP0]], i64 0, i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 8 +; CHECK-NEXT: ret void +; +entry: + %a_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 0 + %a = load i32, i32* %a_src_ptr, align 8 + %a_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 0 + store i32 %a, i32* %a_dst_ptr, align 8 + %b_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 1 + %b = load i32, i32* %b_src_ptr, align 8 + %b_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 1 + store i32 %b, i32* %b_dst_ptr, align 8 + %c_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 2 + %c = load i32, i32* %c_src_ptr, align 8 + %c_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 2 + store i32 %c, i32* %c_dst_ptr, align 8 + %d_src_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %1, i64 0, i32 3 + %d = load i32, i32* %d_src_ptr, align 8 + %d_dst_ptr = getelementptr inbounds %struct.S4x32, %struct.S4x32* %0, i64 0, i32 3 + store i32 %d, i32* %d_dst_ptr, align 8 + ret void +} + +; two_i32 +; +;struct S { +; int32_t a; +; int32_t b; +; int64_t c; +; int32_t d; +;}; +; +;S copy_2xi32(const S& s) { +; S result; +; result.a = s.a; +; result.b = s.b; +; return result; +;} + +%struct.S2x32 = type { i32, i32, i64, i32 } + +define void @two_i32(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) { +; CHECK-LABEL: @two_i32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] 
= bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT:    [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[A_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+  %a = load i32, i32* %a_src_ptr, align 8
+  %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+  store i32 %a, i32* %a_dst_ptr, align 8
+  %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+  %b = load i32, i32* %b_src_ptr, align 8
+  %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+  store i32 %b, i32* %b_dst_ptr, align 8
+  ret void
+}
+
+define void @two_i32_swap(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_swap(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A_SRC_PTR]] to <2 x i32>*
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 8
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[B_DST_PTR]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[REORDER_SHUFFLE]], <2 x i32>* [[TMP4]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0
+  %a = load i32, i32* %a_src_ptr, align 8
+  %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1
+  store i32 %a, i32* %a_dst_ptr, align 8
+  %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1
+  %b = load i32, i32* %b_src_ptr, align 8
+  %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0
+  store i32 %b, i32* %b_dst_ptr, align 8
+  ret void
+}
+
+define void @two_i32_add(%struct.S2x32* noalias nocapture sret, %struct.S2x32* nocapture readonly dereferenceable(24)) {
+; CHECK-LABEL: @two_i32_add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32:%.*]], %struct.S2x32* [[TMP1:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[A:%.*]] = load i32, i32* [[A_SRC_PTR]], align 8
+; CHECK-NEXT:    [[A_PLUS_1:%.*]] = add nsw i32 [[A]], 1
+; CHECK-NEXT:    [[A_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0:%.*]], i64 0, i32 0
+; CHECK-NEXT:    store i32 [[A_PLUS_1]], i32* [[A_DST_PTR]], align 8
+; CHECK-NEXT:    [[B_SRC_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP1]], i64 0, i32 1
+; CHECK-NEXT:    [[B:%.*]] = load i32, i32* [[B_SRC_PTR]], align 8
+; CHECK-NEXT:    [[B_PLUS_1:%.*]] = add nsw i32 [[B]], 1
+; CHECK-NEXT:    [[B_DST_PTR:%.*]] = getelementptr inbounds [[STRUCT_S2X32]], %struct.S2x32* [[TMP0]], i64 0, i32 1
+; CHECK-NEXT:    store i32 [[B_PLUS_1]], i32* [[B_DST_PTR]], align 8
+; 
CHECK-NEXT: ret void +; +entry: + %a_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 0 + %a = load i32, i32* %a_src_ptr, align 8 + %a_plus_1 = add nsw i32 %a, 1 + %a_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 0 + store i32 %a_plus_1, i32* %a_dst_ptr, align 8 + %b_src_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %1, i64 0, i32 1 + %b = load i32, i32* %b_src_ptr, align 8 + %b_plus_1 = add nsw i32 %b, 1 + %b_dst_ptr = getelementptr inbounds %struct.S2x32, %struct.S2x32* %0, i64 0, i32 1 + store i32 %b_plus_1, i32* %b_dst_ptr, align 8 + ret void +} Index: test/Transforms/SLPVectorizer/X86/tiny-tree.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -172,13 +172,13 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 ; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 -; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 -; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1