Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1629,15 +1629,16 @@
         break;
       }
 
-      BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
-
       if (ReverseConsecutive) {
         --NumOpsWantToKeepOrder[S.Opcode];
+        newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
         DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
-      } else {
-        DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+        return;
       }
+
+      DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+      BS.cancelScheduling(VL, VL0);
+      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
       return;
     }
     case Instruction::ZExt:
@@ -2245,6 +2246,10 @@
           TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
                                            VecTy, alignment, 0, VL0);
+      if (!isConsecutiveAccess(VL[0], VL[1], *DL, *SE)) {
+        VecLdCost += TTI->getShuffleCost(
+            TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+      }
       return ReuseShuffleCost + VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
@@ -3222,6 +3227,13 @@
       }
       LI->setAlignment(Alignment);
       Value *V = propagateMetadata(LI, E->Scalars);
+      if (!isConsecutiveAccess(E->Scalars[0], E->Scalars[1], *DL, *SE)) {
+        SmallVector<uint32_t, 4> Mask;
+        Mask.reserve(E->Scalars.size());
+        for (uint32_t I = E->Scalars.size(); I > 0; --I)
+          Mask.emplace_back(I - 1);
+        V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
+      }
       if (NeedToShuffleReuses) {
         V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
                                         E->ReuseShuffleIndices, "shuffle");
Index: test/Transforms/SLPVectorizer/X86/PR32086.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/PR32086.ll
+++ test/Transforms/SLPVectorizer/X86/PR32086.ll
@@ -33,15 +33,15 @@
 define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) {
 ; CHECK-LABEL: @i64_simplifiedi_reversed(
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1
-; CHECK-NEXT:    [[T0:%.*]] = load i64, i64* [[LD]], align 8
-; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[ARRAYIDX1]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3
-; CHECK-NEXT:    store i64 [[T1]], i64* [[ST]], align 8
-; CHECK-NEXT:    store i64 [[T0]], i64* [[ARRAYIDX3]], align 8
-; CHECK-NEXT:    store i64 [[T1]], i64* [[ARRAYIDX4]], align 8
-; CHECK-NEXT:    store i64 [[T0]], i64* [[ARRAYIDX5]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>*
+; CHECK-NEXT:    store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1
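
Note on the codegen and cost hunks: for a reversed bundle the patch builds the
mask [N-1, ..., 0] over the N loaded lanes and emits a single-source
shufflevector, and the cost-model hunk charges one SK_PermuteSingleSrc shuffle
for exactly that permutation, so cost and codegen stay in sync. Below is a
minimal standalone sketch of the mask construction; this is plain C++ for
illustration, not SLPVectorizer code, and NumLanes is a hypothetical stand-in
for E->Scalars.size().

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      // Stand-in for E->Scalars.size(); the test above vectorizes a
      // <2 x i64> load, so two lanes.
      const uint32_t NumLanes = 2;
      std::vector<uint32_t> Mask;
      Mask.reserve(NumLanes);
      // Same loop shape as the patch: counting N down to 1 and pushing
      // I - 1 yields [N-1, ..., 0], which reverses the lanes.
      for (uint32_t I = NumLanes; I > 0; --I)
        Mask.push_back(I - 1);
      for (uint32_t M : Mask)
        std::printf("%u ", M); // prints "1 0", matching <i32 1, i32 0>
      std::printf("\n");
      return 0;
    }

Applied to the wide load, this mask restores the element order the scalar code
expects; the ReuseShuffleIndices shuffle (<i32 0, i32 1, i32 0, i32 1> in the
test) then widens the two unique values to the four stored lanes.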