diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2727,7 +2727,8 @@ } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { - if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { + if (MemoryDepSD->hasValidDependencies() && + MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; @@ -2782,7 +2783,8 @@ void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { doForAllOpcodes(I, [&](ScheduleData *SD) { - if (SD->isSchedulingEntity() && SD->isReady()) { + if (SD->isSchedulingEntity() && SD->hasValidDependencies() && + SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() << "SLP: initially in ready list: " << *SD << "\n"); @@ -7872,6 +7874,11 @@ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); + // A key point - if we got here, pre-scheduling was able to find a valid + // scheduling of the sub-graph of the scheduling window which consists + // of all vector bundles and their transitive users. As such, we do not + // need to reschedule anything *outside of* that subgraph. + BS->resetSchedule(); // For the real scheduling we use a more sophisticated ready-list: it is @@ -7884,21 +7891,19 @@ }; std::set ReadyInsts; - // Ensure that all dependency data is updated and fill the ready-list with - // initial instructions. + // Ensure that all dependency data is updated (for nodes in the sub-graph) + // and fill the ready-list with initial instructions. int Idx = 0; - int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) { assert((isVectorLikeInstWithConstOps(SD->Inst) || SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; - if (SD->isSchedulingEntity()) { + + if (SD->isSchedulingEntity() && SD->isPartOfBundle()) BS->calculateDependencies(SD, false, this); - NumToSchedule++; - } }); } BS->initialFillReadyList(ReadyInsts); @@ -7921,9 +7926,7 @@ } BS->schedule(picked, ReadyInsts); - NumToSchedule--; } - assert(NumToSchedule == 0 && "could not schedule all instructions"); // Check that we didn't break any of our invariants. #ifdef EXPENSIVE_CHECKS diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll @@ -9,11 +9,11 @@ ; CHECK-LABEL: @f( ; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0 ; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1 +; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0 +; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[R0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], [[TMP2]] -; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0 -; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[W0]] to <2 x float>* ; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll @@ -12,9 +12,9 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] @@ -57,9 +57,9 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] ; CHECK: for.body3.lr.ph: -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -36,6 +36,7 @@ ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -85,7 +86,6 @@ ; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -111,6 +111,7 @@ ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -160,7 +161,6 @@ ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -297,6 +297,7 @@ ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -346,7 +347,6 @@ ; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -372,6 +372,7 @@ ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -421,7 +422,6 @@ ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -37,9 +37,9 @@ ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] @@ -163,9 +163,9 @@ ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 2 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 2 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 3 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_017]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] @@ -274,10 +274,10 @@ ; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 6 ; CHECK-NEXT: [[ARRAYIDX63:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 6 ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 7 +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1_044]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2_045]] to <8 x i8>* ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll @@ -18,6 +18,7 @@ ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2 ; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3 +; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to <4 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> @@ -32,7 +33,6 @@ ; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8> -; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[DST]] to <4 x i8>* ; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll @@ -17,58 +17,58 @@ ; CHECK-NEXT: [[TEMP2:%.*]] = load double, double* [[ARRAYIDX5_I]], align 8 ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1 +; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2 +; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2 +; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3 +; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3 +; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0 +; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8 +; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 +; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 +; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX3_I]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TEMP]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7_I]] to <2 x double>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[TEMP2]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 2 -; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 2 -; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 0, i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* [[TMP12]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[TMP4]], [[TMP13]] -; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], [4 x double]* [[B]], i64 1, i64 3 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* -; CHECK-NEXT: [[TMP16:%.*]] = load <2 x double>, <2 x double>* [[TMP15]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP9]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[TMP14]], [[TMP17]] -; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 0 -; CHECK-NEXT: [[TEMP10:%.*]] = load double, double* [[ARRAYIDX47_I]], align 8 -; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[A]], i64 1, i64 1 -; CHECK-NEXT: [[TEMP11:%.*]] = load double, double* [[ARRAYIDX52_I]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TEMP10]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP2]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[TMP22]], double [[TEMP11]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = fmul <2 x double> [[TMP7]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP21]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP13]], [[TMP20]] -; CHECK-NEXT: [[TMP27:%.*]] = fmul <2 x double> [[TMP16]], [[TMP23]] -; CHECK-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 1 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[OUT]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP29]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[OUT]] to <2 x double>* ; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 2 ; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 3 -; CHECK-NEXT: [[TMP30:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP18]], <2 x double>* [[TMP30]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[ARRAYIDX25_I]] to <2 x double>* +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP4]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double* [[ARRAYIDX30_I]] to <2 x double>* +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x double>, <2 x double>* [[TMP16]], align 8 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP9]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <2 x double> [[TMP15]], [[TMP18]] +; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast double* [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[TMP20]], align 8 ; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 4 ; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 5 -; CHECK-NEXT: [[TMP31:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP25]], <2 x double>* [[TMP31]], align 8 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[TMP21]], double [[TEMP10]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP2]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x double> [[TMP24]], double [[TEMP11]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = fmul <2 x double> [[TMP7]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP23]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast double* [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP27]], <2 x double>* [[TMP28]], align 8 ; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 6 ; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 7 +; CHECK-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[TMP14]], [[TMP22]] +; CHECK-NEXT: [[TMP30:%.*]] = fmul <2 x double> [[TMP17]], [[TMP25]] +; CHECK-NEXT: [[TMP31:%.*]] = fadd <2 x double> [[TMP29]], [[TMP30]] ; CHECK-NEXT: [[TMP32:%.*]] = bitcast double* [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP28]], <2 x double>* [[TMP32]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP31]], <2 x double>* [[TMP32]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1.i = getelementptr inbounds [2 x double], [2 x double]* %A, i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -326,10 +326,10 @@ ; CHECK-LABEL: @no_version( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SRC_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 +; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i32> [[TMP1]], -; CHECK-NEXT: [[DST_GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <2 x i32>* ; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void @@ -902,12 +902,7 @@ ; CHECK-NEXT: [[A_GEP_14:%.*]] = getelementptr i8, i8* [[A]], i64 14 ; CHECK-NEXT: [[B_GEP_14:%.*]] = getelementptr i8, i8* [[B]], i64 14 ; CHECK-NEXT: [[A_GEP_15:%.*]] = getelementptr i8, i8* [[A]], i64 15 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>* -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[B_GEP_15:%.*]] = getelementptr i8, i8* [[B]], i64 15 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>* -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[R_GEP_0:%.*]] = getelementptr i8, i8* [[ARG1]], i64 0 ; CHECK-NEXT: [[R_GEP_1:%.*]] = getelementptr i8, i8* [[ARG1]], i64 1 ; CHECK-NEXT: [[R_GEP_2:%.*]] = getelementptr i8, i8* [[ARG1]], i64 2 @@ -924,6 +919,11 @@ ; CHECK-NEXT: [[R_GEP_13:%.*]] = getelementptr i8, i8* [[ARG1]], i64 13 ; CHECK-NEXT: [[R_GEP_14:%.*]] = getelementptr i8, i8* [[ARG1]], i64 14 ; CHECK-NEXT: [[R_GEP_15:%.*]] = getelementptr i8, i8* [[ARG1]], i64 15 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[A_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B_GEP_0]] to <16 x i8>* +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i8> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[R_GEP_0]] to <16 x i8>* ; CHECK-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[TMP5]], align 1 ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds i8, i8* [[ARG3]], i64 15 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll @@ -13,14 +13,14 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-and-reduction.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-or-reduction.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i8> [[TMP3]], [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-xor-reduction.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX3_6:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[A]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX]] to <8 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 -; CHECK-NEXT: [[ARRAYIDX3_7:%.*]] = getelementptr inbounds [[STRUCT_BUF]], %struct.buf* [[B]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[ARRAYIDX3]] to <8 x i8>* ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP2]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[TMP1]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll @@ -15,13 +15,13 @@ ; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]] ; CHECK-NEXT: [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG28:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG29:![0-9]+]] +; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG31:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg [[DBG26]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA29:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]] -; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG34:![0-9]+]] -; CHECK-NEXT: [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG35:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA32:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg [[DBG36:![0-9]+]] -; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA29]] +; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA32]] ; CHECK-NEXT: ret void, !dbg [[DBG37:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-order.ll @@ -15,15 +15,15 @@ ; CHECK-NEXT: [[CALL_I_I:%.*]] = call i32* @get_ptr() ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 2 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 1 +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3 +; CHECK-NEXT: [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[CALL_I_I]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 2 -; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i32, i32* [[CALL_I_I]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[GEP_1]] to <2 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[RES_1:%.*]] = getelementptr i64, i64* [[RES:%.*]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[RES]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* [[TMP7]], align 8 ; CHECK-NEXT: [[C:%.*]] = call i1 @cond() diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -135,13 +135,13 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] @@ -171,12 +171,12 @@ ; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP6]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -135,13 +135,13 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[V0]], i64 0 ; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[TMP4]], [[TMP2]] ; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_0]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0_1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = sub <2 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2_31:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2_31]] @@ -171,12 +171,12 @@ ; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[V0]], [[V1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = xor <2 x i32> [[V0]], [[V1]] -; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP6]] -; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[SHUFFLE]], [[TMP7]] +; CHECK-NEXT: [[TMP3_31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> [[TMP8]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_31]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s352.ll @@ -33,9 +33,9 @@ ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX6]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] @@ -48,9 +48,9 @@ ; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP9]] +; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP9]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>* ; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4 -; CHECK-NEXT: [[ARRAYIDX29:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>* ; CHECK-NEXT: [[TMP13:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP11]], [[TMP13]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/widen.ll @@ -25,14 +25,6 @@ ; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 13 ; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 14 ; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 15 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A]] to <8 x i8>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[ARRAYIDX_8]] to <8 x i8>* -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 -; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i16> -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw <8 x i16> [[TMP5]], -; CHECK-NEXT: [[TMP8:%.*]] = shl nuw <8 x i16> [[TMP6]], ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 3 @@ -48,10 +40,18 @@ ; CHECK-NEXT: [[ARRAYIDX3_13:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 13 ; CHECK-NEXT: [[ARRAYIDX3_14:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 14 ; CHECK-NEXT: [[ARRAYIDX3_15:%.*]] = getelementptr inbounds i16, i16* [[B]], i64 15 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[B]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* [[TMP9]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A]] to <8 x i8>* +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw <8 x i16> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[B]] to <8 x i16>* +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[ARRAYIDX_8]] to <8 x i8>* +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP6]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw <8 x i16> [[TMP8]], +; CHECK-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* [[TMP5]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[ARRAYIDX3_8]] to <8 x i16>* -; CHECK-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* [[TMP10]], align 2 +; CHECK-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* [[TMP10]], align 2 ; CHECK-NEXT: ret void ; %arrayidx.1 = getelementptr inbounds i8, i8* %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/packed-math.ll @@ -191,10 +191,10 @@ ; GCN-LABEL: @test1_fabs_scalar_fma_v2f16( ; GCN-NEXT: [[I1:%.*]] = load half, half addrspace(3)* [[B:%.*]], align 2 ; GCN-NEXT: [[I1_FABS:%.*]] = call half @llvm.fabs.f16(half [[I1]]) -; GCN-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* -; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 ; GCN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds half, half addrspace(3)* [[B]], i64 1 ; GCN-NEXT: [[I4:%.*]] = load half, half addrspace(3)* [[ARRAYIDX4]], align 2 +; GCN-NEXT: [[TMP1:%.*]] = bitcast half addrspace(3)* [[A:%.*]] to <2 x half> addrspace(3)* +; GCN-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP1]], align 2 ; GCN-NEXT: [[TMP3:%.*]] = bitcast half addrspace(3)* [[C:%.*]] to <2 x half> addrspace(3)* ; GCN-NEXT: [[TMP4:%.*]] = load <2 x half>, <2 x half> addrspace(3)* [[TMP3]], align 2 ; GCN-NEXT: [[TMP5:%.*]] = insertelement <2 x half> poison, half [[I1_FABS]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll --- a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll +++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll @@ -14,11 +14,11 @@ ; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half* ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>* ; CHECK-NEXT: store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -14,13 +14,13 @@ ; CHECK-NEXT: [[ARRAYIDX372_1:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 1 ; CHECK-NEXT: [[ARRAYIDX372_2:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 2 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([4 x [4 x i32]], [4 x [4 x i32]]* @bar, i64 0, i64 3, i64 2) to <2 x i32>*), align 4 +; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], -; CHECK-NEXT: [[ARRAYIDX372_3:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* @dct_luma, i64 0, i64 3, i64 3 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX372]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -4,12 +4,12 @@ define void @i64_simplified(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplified( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void @@ -33,12 +33,12 @@ define void @i64_simplifiedi_reversed(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplifiedi_reversed( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void @@ -62,12 +62,12 @@ define void @i64_simplifiedi_extract(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplifiedi_extract( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[SHUFFLE]], i32 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -54,9 +54,11 @@ ; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP12:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] ; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 ; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 +; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 +; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP3]]) ; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] ; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] @@ -88,9 +90,7 @@ ; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] ; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] ; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP2]] -; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]] -; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 ; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL_40]], i32 0 ; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 ; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[VAL_41]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -302,21 +302,21 @@ define void @reorder_alt_rightsubTree(double* nocapture %c, double* noalias nocapture readonly %a, double* noalias nocapture readonly %b, double* noalias nocapture readonly %d) { ; CHECK-LABEL: @reorder_alt_rightsubTree( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[D:%.*]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[D]] to <2 x double>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[A]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[D]] to <2 x double>* ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[B]] to <2 x double>* -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x double> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[A]] to <2 x double>* +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[B]] to <2 x double>* +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fsub <2 x double> [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[TMP11]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[C]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[TMP15]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP14]], <2 x double>* [[TMP15]], align 8 ; CHECK-NEXT: ret void ; %1 = load double, double* %a diff --git a/llvm/test/Transforms/SLPVectorizer/X86/align.ll b/llvm/test/Transforms/SLPVectorizer/X86/align.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/align.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/align.ll @@ -12,9 +12,9 @@ ; CHECK-NEXT: [[STORE1:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 1 ; CHECK-NEXT: [[STORE2:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 2 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] @@ -48,11 +48,11 @@ ; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1 ; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 ; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 ; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 ; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-abs.ll @@ -28,40 +28,40 @@ define void @abs_v8i64() { ; SSE-LABEL: @abs_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false) -; SSE-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP5]], i1 false) +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP7]], i1 false) ; SSE-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP4]], i1 false) -; SLM-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP1]], i1 false) +; SLM-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP3]], i1 false) +; SLM-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP5]], i1 false) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP7]], i1 false) ; SLM-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP2]], i1 false) -; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP1]], i1 false) +; AVX-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.abs.v4i64(<4 x i64> [[TMP3]], i1 false) ; AVX-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -101,40 +101,40 @@ define void @abs_v16i32() { ; SSE-LABEL: @abs_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 false) -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP5]], i1 false) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP7]], i1 false) ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP4]], i1 false) -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP1]], i1 false) +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP3]], i1 false) +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP5]], i1 false) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP7]], i1 false) ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP2]], i1 false) -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP3]], i1 false) ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -198,40 +198,40 @@ define void @abs_v32i16() { ; SSE-LABEL: @abs_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP4]], i1 false) -; SSE-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP5]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP7]], i1 false) ; SSE-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP4]], i1 false) -; SLM-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP7]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP1]], i1 false) +; SLM-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP3]], i1 false) +; SLM-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP5]], i1 false) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP7]], i1 false) ; SLM-NEXT: store <8 x i16> [[TMP8]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP2]], i1 false) -; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP1]], i1 false) +; AVX-NEXT: store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> [[TMP3]], i1 false) ; AVX-NEXT: store <16 x i16> [[TMP4]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -343,40 +343,40 @@ define void @abs_v64i8() { ; SSE-LABEL: @abs_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP2]], i1 false) -; SSE-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) -; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP4]], i1 false) -; SSE-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP5]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP7]], i1 false) ; SSE-NEXT: store <16 x i8> [[TMP8]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @abs_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) -; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP2]], i1 false) -; SLM-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) -; SLM-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP4]], i1 false) -; SLM-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP1]], i1 false) +; SLM-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP3]], i1 false) +; SLM-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP5]], i1 false) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP7]], i1 false) ; SLM-NEXT: store <16 x i8> [[TMP8]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @abs_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP1]], i1 false) -; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP2]], i1 false) -; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP1]], i1 false) +; AVX-NEXT: store <32 x i8> [[TMP2]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> [[TMP3]], i1 false) ; AVX-NEXT: store <32 x i8> [[TMP4]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll @@ -98,12 +98,12 @@ ; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -190,50 +190,50 @@ define void @add_v16i32() { ; SSE-LABEL: @add_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -314,50 +314,50 @@ define void @add_v32i16() { ; SSE-LABEL: @add_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -502,50 +502,50 @@ define void @add_v64i8() { ; SSE-LABEL: @add_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll @@ -63,12 +63,12 @@ ; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,31 +117,31 @@ define void @add_v16i32() { ; SSE-LABEL: @add_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @add_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -222,31 +222,31 @@ define void @add_v32i16() { ; SSE-LABEL: @add_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @add_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -391,31 +391,31 @@ define void @add_v64i8() { ; SSE-LABEL: @add_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @add_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll @@ -25,50 +25,50 @@ define void @add_v8i64() { ; SSE-LABEL: @add_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,50 +117,50 @@ define void @add_v16i32() { ; SSE-LABEL: @add_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = add <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -241,50 +241,50 @@ define void @add_v32i16() { ; SSE-LABEL: @add_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = add <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -429,50 +429,50 @@ define void @add_v64i8() { ; SSE-LABEL: @add_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @add_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = add <16 x i8> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = add <16 x i8> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = add <16 x i8> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]] +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @add_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = add <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = add <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = add <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll @@ -25,40 +25,40 @@ define void @sdiv_v16i32_uniformconst() { ; SSE-LABEL: @sdiv_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sdiv <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sdiv_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = sdiv <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sdiv_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = sdiv <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = sdiv <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = sdiv <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = sdiv <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -122,40 +122,40 @@ define void @srem_v16i32_uniformconst() { ; SSE-LABEL: @srem_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = srem <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = srem <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @srem_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = srem <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = srem <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @srem_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = srem <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = srem <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = srem <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = srem <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -219,40 +219,40 @@ define void @udiv_v16i32_uniformconst() { ; SSE-LABEL: @udiv_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = udiv <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = udiv <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @udiv_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = udiv <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = udiv <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @udiv_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = udiv <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = udiv <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = udiv <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = udiv <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -316,40 +316,40 @@ define void @urem_v16i32_uniformconst() { ; SSE-LABEL: @urem_v16i32_uniformconst( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], -; SSE-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], -; SSE-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], -; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = urem <4 x i32> [[TMP1]], +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = urem <4 x i32> [[TMP3]], +; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP5]], +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], ; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @urem_v16i32_uniformconst( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], -; SLM-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], -; SLM-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], -; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = urem <4 x i32> [[TMP1]], +; SLM-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = urem <4 x i32> [[TMP3]], +; SLM-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP5]], +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP7]], ; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @urem_v16i32_uniformconst( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = urem <8 x i32> [[TMP1]], -; AVX-NEXT: [[TMP4:%.*]] = urem <8 x i32> [[TMP2]], -; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = urem <8 x i32> [[TMP1]], +; AVX-NEXT: store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = urem <8 x i32> [[TMP3]], ; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll @@ -28,69 +28,69 @@ define void @smul_v8i64() { ; SSE-LABEL: @smul_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smul_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @smul_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @smul_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX2-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void ; @@ -103,12 +103,12 @@ ; ; AVX256BW-LABEL: @smul_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX256BW-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void ; @@ -150,20 +150,20 @@ define void @smul_v16i32() { ; SSE-LABEL: @smul_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -236,12 +236,12 @@ ; ; AVX-LABEL: @smul_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], i32 3) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smul.fix.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]], i32 3) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -322,50 +322,50 @@ define void @smul_v32i16() { ; SSE-LABEL: @smul_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smul_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smul_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], i32 3) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smul.fix.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]], i32 3) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -510,50 +510,50 @@ define void @smul_v64i8() { ; SSE-LABEL: @smul_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smul_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smul_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]], i32 3) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], i32 3) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smul.fix.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]], i32 3) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -831,69 +831,69 @@ define void @umul_v8i64() { ; SSE-LABEL: @umul_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umul_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX1-LABEL: @umul_v8i64( ; AVX1-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]], i32 3) -; AVX1-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]], i32 3) -; AVX1-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]], i32 3) -; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]], i32 3) -; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX1-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i32 3) +; AVX1-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX1-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umul.fix.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i32 3) ; AVX1-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @umul_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX2-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void ; @@ -906,12 +906,12 @@ ; ; AVX256BW-LABEL: @umul_v8i64( ; AVX256BW-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256BW-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]], i32 3) -; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]], i32 3) -; AVX256BW-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]], i32 3) +; AVX256BW-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256BW-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umul.fix.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]], i32 3) ; AVX256BW-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256BW-NEXT: ret void ; @@ -953,20 +953,20 @@ define void @umul_v16i32() { ; SSE-LABEL: @umul_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]], i32 3) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 3) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32 3) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -1039,12 +1039,12 @@ ; ; AVX-LABEL: @umul_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]], i32 3) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], i32 3) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umul.fix.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]], i32 3) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -1125,50 +1125,50 @@ define void @umul_v32i16() { ; SSE-LABEL: @umul_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umul_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]], i32 3) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i32 3) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umul.fix.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i32 3) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umul_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]], i32 3) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]], i32 3) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umul.fix.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]], i32 3) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -1313,50 +1313,50 @@ define void @umul_v64i8() { ; SSE-LABEL: @umul_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umul_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]], i32 3) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]], i32 3) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]], i32 3) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]], i32 3) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umul_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]], i32 3) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]], i32 3) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], i32 3) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umul.fix.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]], i32 3) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll @@ -95,31 +95,31 @@ ; ; AVX128-LABEL: @mul_v8i64( ; AVX128-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; AVX128-NEXT: [[TMP9:%.*]] = mul <2 x i64> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <2 x i64> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <2 x i64> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; AVX128-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; AVX128-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP3:%.*]] = mul <2 x i64> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP9:%.*]] = mul <2 x i64> [[TMP7]], [[TMP8]] +; AVX128-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; AVX128-NEXT: [[TMP12:%.*]] = mul <2 x i64> [[TMP10]], [[TMP11]] ; AVX128-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v8i64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX256-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP3:%.*]] = mul <4 x i64> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX256-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256-NEXT: ret void ; @@ -168,69 +168,69 @@ define void @mul_v16i32() { ; SSE-LABEL: @mul_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX128-LABEL: @mul_v16i32( ; AVX128-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; AVX128-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; AVX128-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; AVX128-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP7]], [[TMP8]] +; AVX128-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; AVX128-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP10]], [[TMP11]] ; AVX128-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v16i32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = mul <8 x i32> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX256-NEXT: ret void ; @@ -311,69 +311,69 @@ define void @mul_v32i16() { ; SSE-LABEL: @mul_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX128-LABEL: @mul_v32i16( ; AVX128-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; AVX128-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; AVX128-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; AVX128-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP3:%.*]] = mul <8 x i16> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP6:%.*]] = mul <8 x i16> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP7]], [[TMP8]] +; AVX128-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; AVX128-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP10]], [[TMP11]] ; AVX128-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v32i16( ; AVX256-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX256-NEXT: [[TMP5:%.*]] = mul <16 x i16> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP3:%.*]] = mul <16 x i16> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX256-NEXT: [[TMP6:%.*]] = mul <16 x i16> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX256-NEXT: ret void ; @@ -518,69 +518,69 @@ define void @mul_v64i8() { ; SSE-LABEL: @mul_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @mul_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX128-LABEL: @mul_v64i8( ; AVX128-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; AVX128-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP1]], [[TMP5]] -; AVX128-NEXT: [[TMP10:%.*]] = mul <16 x i8> [[TMP2]], [[TMP6]] -; AVX128-NEXT: [[TMP11:%.*]] = mul <16 x i8> [[TMP3]], [[TMP7]] -; AVX128-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP4]], [[TMP8]] -; AVX128-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; AVX128-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; AVX128-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP3:%.*]] = mul <16 x i8> [[TMP1]], [[TMP2]] +; AVX128-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP6:%.*]] = mul <16 x i8> [[TMP4]], [[TMP5]] +; AVX128-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]] +; AVX128-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; AVX128-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]] +; AVX128-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; AVX128-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; AVX128-NEXT: ret void ; ; AVX256-LABEL: @mul_v64i8( ; AVX256-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX256-NEXT: [[TMP5:%.*]] = mul <32 x i8> [[TMP1]], [[TMP3]] -; AVX256-NEXT: [[TMP6:%.*]] = mul <32 x i8> [[TMP2]], [[TMP4]] -; AVX256-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP3:%.*]] = mul <32 x i8> [[TMP1]], [[TMP2]] +; AVX256-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX256-NEXT: [[TMP6:%.*]] = mul <32 x i8> [[TMP4]], [[TMP5]] ; AVX256-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll @@ -63,31 +63,31 @@ ; ; SLM-LABEL: @smax_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smax.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smax.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @smax_v16i32() { ; SSE-LABEL: @smax_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smax_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @smax_v32i16() { ; SSE-LABEL: @smax_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smax_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @smax_v64i8() { ; SSE-LABEL: @smax_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smax_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smax_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll @@ -63,31 +63,31 @@ ; ; SLM-LABEL: @smin_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.smin.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.smin.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @smin_v16i32() { ; SSE-LABEL: @smin_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smin_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.smin.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @smin_v32i16() { ; SSE-LABEL: @smin_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smin_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @smin_v64i8() { ; SSE-LABEL: @smin_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @smin_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @smin_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll @@ -98,12 +98,12 @@ ; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -190,50 +190,50 @@ define void @sub_v16i32() { ; SSE-LABEL: @sub_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -314,50 +314,50 @@ define void @sub_v32i16() { ; SSE-LABEL: @sub_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -502,50 +502,50 @@ define void @sub_v64i8() { ; SSE-LABEL: @sub_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll @@ -63,12 +63,12 @@ ; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,31 +117,31 @@ define void @sub_v16i32() { ; SSE-LABEL: @sub_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sub_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -222,31 +222,31 @@ define void @sub_v32i16() { ; SSE-LABEL: @sub_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sub_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -391,31 +391,31 @@ define void @sub_v64i8() { ; SSE-LABEL: @sub_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @sub_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll @@ -25,50 +25,50 @@ define void @sub_v8i64() { ; SSE-LABEL: @sub_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -117,50 +117,50 @@ define void @sub_v16i32() { ; SSE-LABEL: @sub_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = sub <4 x i32> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = sub <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -241,50 +241,50 @@ define void @sub_v32i16() { ; SSE-LABEL: @sub_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP7]], [[TMP8]] +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP10]], [[TMP11]] ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = sub <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = sub <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -429,50 +429,50 @@ define void @sub_v64i8() { ; SSE-LABEL: @sub_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = sub <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sub_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP1]], [[TMP5]] -; SLM-NEXT: [[TMP10:%.*]] = sub <16 x i8> [[TMP2]], [[TMP6]] -; SLM-NEXT: [[TMP11:%.*]] = sub <16 x i8> [[TMP3]], [[TMP7]] -; SLM-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP4]], [[TMP8]] -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = sub <16 x i8> [[TMP1]], [[TMP2]] +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = sub <16 x i8> [[TMP4]], [[TMP5]] +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]] +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]] +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @sub_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = sub <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = sub <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = sub <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = sub <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll @@ -63,31 +63,31 @@ ; ; SLM-LABEL: @umax_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umax.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umax.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @umax_v16i32() { ; SSE-LABEL: @umax_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umax_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umax.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @umax_v32i16() { ; SSE-LABEL: @umax_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umax_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @umax_v64i8() { ; SSE-LABEL: @umax_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umax_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umax_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll @@ -63,31 +63,31 @@ ; ; SLM-LABEL: @umin_v8i64( ; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]]) -; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP2]]) +; SLM-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]]) +; SLM-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]]) +; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SLM-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.umin.v2i64(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]]) ; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP2]], <4 x i64> [[TMP4]]) -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP1]], <4 x i64> [[TMP2]]) +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP4]], <4 x i64> [[TMP5]]) ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -136,50 +136,50 @@ define void @umin_v16i32() { ; SSE-LABEL: @umin_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umin_v16i32( ; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP3]], <4 x i32> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP8]]) -; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; SLM-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) ; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP2]], <8 x i32> [[TMP4]]) -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.umin.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -260,50 +260,50 @@ define void @umin_v32i16() { ; SSE-LABEL: @umin_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umin_v32i16( ; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP3]], <8 x i16> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP8]]) -; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) +; SLM-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; SLM-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) +; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SLM-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) ; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP2]], <16 x i16> [[TMP4]]) -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]]) +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -448,50 +448,50 @@ define void @umin_v64i8() { ; SSE-LABEL: @umin_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; SLM-LABEL: @umin_v64i8( ; SLM-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP5]]) -; SLM-NEXT: [[TMP10:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP6]]) -; SLM-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP7]]) -; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP8]]) -; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SLM-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) +; SLM-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +; SLM-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) +; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]]) +; SLM-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SLM-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SLM-NEXT: ret void ; ; AVX-LABEL: @umin_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP3]]) -; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP2]], <32 x i8> [[TMP4]]) -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]]) +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> [[TMP4]], <32 x i8> [[TMP5]]) ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bitreverse.ll @@ -40,10 +40,10 @@ define void @bitreverse_4i64() #0 { ; SSE-LABEL: @bitreverse_4i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP2]]) -; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]]) +; SSE-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP3]]) ; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE-NEXT: ret void ; @@ -99,10 +99,10 @@ define void @bitreverse_8i32() #0 { ; SSE-LABEL: @bitreverse_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP2]]) -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]]) +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP3]]) ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; @@ -182,10 +182,10 @@ define void @bitreverse_16i16() #0 { ; SSE-LABEL: @bitreverse_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP2]]) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]]) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP3]]) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -313,10 +313,10 @@ define void @bitreverse_32i8() #0 { ; SSE-LABEL: @bitreverse_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]]) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]]) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP3]]) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/broadcast.ll @@ -17,15 +17,15 @@ ; CHECK-NEXT: [[B0:%.*]] = load i64, i64* [[B:%.*]], align 8 ; CHECK-NEXT: [[V1:%.*]] = sub i64 [[A0]], 1 ; CHECK-NEXT: [[V2:%.*]] = sub i64 [[B0]], 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1 +; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2 +; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[V1]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[V2]], i32 0 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i64, i64* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 1 -; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 2 -; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i64, i64* [[S]], i64 3 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[IDXS0]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void @@ -66,11 +66,15 @@ ; CHECK-LABEL: @bcast_vals2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A0:%.*]] = load i16, i16* [[A:%.*]], align 8 +; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1 +; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2 +; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load i16, i16* [[B:%.*]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load i16, i16* [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load i16, i16* [[D:%.*]], align 8 ; CHECK-NEXT: [[E0:%.*]] = load i16, i16* [[E:%.*]], align 8 -; CHECK-NEXT: [[V1:%.*]] = sext i16 [[A0]] to i32 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[C0]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[E0]], i32 2 @@ -79,10 +83,6 @@ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[V1]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[SHUFFLE]], [[TMP4]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 1 -; CHECK-NEXT: [[IDXS2:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 2 -; CHECK-NEXT: [[IDXS3:%.*]] = getelementptr inbounds i32, i32* [[S]], i64 3 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[IDXS0]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bswap.ll @@ -103,10 +103,10 @@ define void @bswap_8i32() #0 { ; SSE-LABEL: @bswap_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP2]]) -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP1]]) +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[TMP3]]) ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE-NEXT: ret void ; @@ -180,10 +180,10 @@ define void @bswap_16i16() #0 { ; SSE-LABEL: @bswap_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP2]]) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP1]]) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP3]]) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/combined-stores-chains.ll @@ -23,21 +23,21 @@ ; CHECK-NEXT: [[T252:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 9 ; CHECK-NEXT: [[T292:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 10 ; CHECK-NEXT: [[T322:%.*]] = getelementptr inbounds i64, i64* [[T02]], i64 11 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[T14]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[T142]] to <2 x i64>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[T142]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i64> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[T212]] to <2 x i64>* ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[T222]] to <2 x i64>* ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i64> [[TMP4]], -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[TMP6]], -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[T212]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[T292]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i64> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[T292]] to <2 x i64>* +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[T14]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], +; CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[T21]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret void ; %t0 = bitcast i8* %v0 to i32* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll @@ -21,9 +21,9 @@ ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1 ; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] @@ -85,9 +85,9 @@ ; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1 ; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] @@ -139,9 +139,9 @@ ; CHECK-NEXT: [[ADD37:%.*]] = add nsw i32 [[MUL]], 3 ; CHECK-NEXT: [[IDXPROM38:%.*]] = sext i32 [[ADD37]] to i64 ; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM38]] +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM38]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM38]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <4 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] @@ -295,9 +295,9 @@ ; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7 ; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD7]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] @@ -343,9 +343,9 @@ ; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7 ; CHECK-NEXT: [[IDXPROM12:%.*]] = zext i32 [[ADD7]] to i64 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll @@ -9,13 +9,13 @@ ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -4,20 +4,12 @@ define void @exceed(double %0, double %1) { ; CHECK-LABEL: @exceed( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef @@ -27,13 +19,21 @@ ; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 ; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -93,12 +93,12 @@ define void @zot(%struct.hoge* %arg) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -36,13 +36,13 @@ ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> , double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> , double [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP7]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable @@ -114,6 +114,7 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: +; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double undef, i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]] @@ -122,7 +123,6 @@ ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]] -; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: br label [[RETURN:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -16,21 +16,21 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[G:%.*]], i64 5 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: ret i32 undef @@ -133,24 +133,24 @@ ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP8]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], ; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP13]], align 8 ; CHECK-NEXT: br label [[TMP24:%.*]] ; CHECK: 14: ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP17:%.*]] = load double, double* [[TMP16]], align 8 ; CHECK-NEXT: [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[TMP20]], double [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], ; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP22]], <2 x double>* [[TMP23]], align 8 ; CHECK-NEXT: br label [[TMP24]] ; CHECK: 24: ; CHECK-NEXT: ret i32 undef @@ -267,10 +267,10 @@ ; CHECK: if.end: ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds double, double* [[A]], i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[N]], 4 ; CHECK-NEXT: [[CONV12:%.*]] = sitofp i32 [[ADD]] to double +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP2]], double [[CONV12]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX7]] to <2 x double>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -168,10 +168,10 @@ ; ; SSE42-LABEL: @ctlz_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 false) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -245,10 +245,10 @@ define void @ctlz_16i16() #0 { ; SSE-LABEL: @ctlz_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 false) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 false) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -370,10 +370,10 @@ define void @ctlz_32i8() #0 { ; SSE-LABEL: @ctlz_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 false) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 false) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; @@ -630,10 +630,10 @@ ; ; SSE42-LABEL: @ctlz_undef_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP2]], i1 true) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 true) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -707,10 +707,10 @@ define void @ctlz_undef_16i16() #0 { ; SSE-LABEL: @ctlz_undef_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP2]], i1 true) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 true) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -832,10 +832,10 @@ define void @ctlz_undef_32i8() #0 { ; SSE-LABEL: @ctlz_undef_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP2]], i1 true) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 true) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctpop.ll @@ -63,10 +63,10 @@ define void @ctpop_4i64() #0 { ; SSE2-LABEL: @ctpop_4i64( ; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4 -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 -; SSE2-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) -; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]]) -; SSE2-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]]) +; SSE2-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4 +; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP3]]) ; SSE2-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4 ; SSE2-NEXT: ret void ; @@ -182,10 +182,10 @@ define void @ctpop_8i32() #0 { ; SSE2-LABEL: @ctpop_8i32( ; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE2-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) -; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP2]]) -; SSE2-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP1]]) +; SSE2-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> [[TMP3]]) ; SSE2-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE2-NEXT: ret void ; @@ -313,10 +313,10 @@ define void @ctpop_16i16() #0 { ; SSE-LABEL: @ctpop_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP2]]) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP1]]) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> [[TMP3]]) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -438,10 +438,10 @@ define void @ctpop_32i8() #0 { ; SSE-LABEL: @ctpop_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP2]]) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP1]]) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP3]]) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -168,10 +168,10 @@ ; ; SSE42-LABEL: @cttz_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 false) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP3]], i1 false) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -245,10 +245,10 @@ define void @cttz_16i16() #0 { ; SSE-LABEL: @cttz_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 false) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 false) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP3]], i1 false) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -370,10 +370,10 @@ define void @cttz_32i8() #0 { ; SSE-LABEL: @cttz_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 false) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 false) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP3]], i1 false) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; @@ -630,10 +630,10 @@ ; ; SSE42-LABEL: @cttz_undef_8i32( ; SSE42-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 -; SSE42-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) -; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP2]], i1 true) -; SSE42-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 true) +; SSE42-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2 +; SSE42-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP3]], i1 true) ; SSE42-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2 ; SSE42-NEXT: ret void ; @@ -707,10 +707,10 @@ define void @cttz_undef_16i16() #0 { ; SSE-LABEL: @cttz_undef_16i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP2]], i1 true) -; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP1]], i1 true) +; SSE-NEXT: store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> [[TMP3]], i1 true) ; SSE-NEXT: store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; @@ -832,10 +832,10 @@ define void @cttz_undef_32i8() #0 { ; SSE-LABEL: @cttz_undef_32i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true) -; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP2]], i1 true) -; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP1]], i1 true) +; SSE-NEXT: store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> [[TMP3]], i1 true) ; SSE-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond.ll @@ -21,12 +21,12 @@ ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: ret i32 0 @@ -72,12 +72,12 @@ ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 @@ -116,12 +116,12 @@ ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[MUL238]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast.ll @@ -7,10 +7,10 @@ ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/diamond_broadcast_extra_shuffle.ll @@ -7,10 +7,10 @@ ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 @@ -37,10 +37,10 @@ ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 @@ -67,10 +67,10 @@ ; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[LD]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[SHUFFLE]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll b/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/different-vec-widths.ll @@ -26,19 +26,19 @@ ; SSE-NEXT: [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5 ; SSE-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>* ; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 -; SSE-NEXT: [[TMP3:%.*]] = bitcast double* [[P2]] to <2 x double>* -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; SSE-NEXT: [[TMP5:%.*]] = bitcast double* [[P4]] to <2 x double>* +; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; SSE-NEXT: [[TMP4:%.*]] = bitcast double* [[Q0]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 +; SSE-NEXT: [[TMP5:%.*]] = bitcast double* [[P2]] to <2 x double>* ; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 -; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP2]], -; SSE-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP4]], -; SSE-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP6]], -; SSE-NEXT: [[TMP10:%.*]] = bitcast double* [[Q0]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP10]], align 8 -; SSE-NEXT: [[TMP11:%.*]] = bitcast double* [[Q2]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP11]], align 8 +; SSE-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], +; SSE-NEXT: [[TMP8:%.*]] = bitcast double* [[Q2]] to <2 x double>* +; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; SSE-NEXT: [[TMP9:%.*]] = bitcast double* [[P4]] to <2 x double>* +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 8 +; SSE-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], ; SSE-NEXT: [[TMP12:%.*]] = bitcast double* [[Q4]] to <2 x double>* -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP12]], align 8 +; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @PR28457( @@ -56,14 +56,14 @@ ; AVX-NEXT: [[Q5:%.*]] = getelementptr inbounds double, double* [[Q]], i64 5 ; AVX-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <4 x double>* ; AVX-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 8 -; AVX-NEXT: [[TMP3:%.*]] = bitcast double* [[P4]] to <2 x double>* -; AVX-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 -; AVX-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[TMP2]], -; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], -; AVX-NEXT: [[TMP7:%.*]] = bitcast double* [[Q0]] to <4 x double>* -; AVX-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP7]], align 8 +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = bitcast double* [[Q0]] to <4 x double>* +; AVX-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 +; AVX-NEXT: [[TMP5:%.*]] = bitcast double* [[P4]] to <2 x double>* +; AVX-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 8 +; AVX-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], ; AVX-NEXT: [[TMP8:%.*]] = bitcast double* [[Q4]] to <2 x double>* -; AVX-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP8]], align 8 +; AVX-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; AVX-NEXT: ret void ; %p0 = getelementptr inbounds double, double* %p, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll @@ -20,14 +20,14 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[PTRX2]] to <2 x double>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[PTRY2]] to <2 x double>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PTRX2]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[PTRY2]] to <2 x double>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x double> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP5]], i32 1 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP10]], i32 0 ; CHECK-NEXT: [[DOT012:%.*]] = fadd double [[DOT01]], [[TMP13]] @@ -71,14 +71,14 @@ ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[PTRX2]] to <2 x float>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[PTRY2]] to <2 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PTRX2]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[PTRY2]] to <2 x float>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[TMP11]], [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 ; CHECK-NEXT: [[DOT012:%.*]] = fadd float [[DOT01]], [[TMP13]] @@ -202,11 +202,11 @@ ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 ; CHECK-NEXT: [[DOT01:%.*]] = fadd double [[MUL0]], [[TMP6]] @@ -240,11 +240,11 @@ ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load float, float* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, float* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 ; CHECK-NEXT: [[DOT01:%.*]] = fadd float [[MUL0]], [[TMP6]] @@ -278,11 +278,11 @@ ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds double, double* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load double, double* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load double, double* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[PTRX1]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[PTRY1]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul double [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i32 0 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast double [[MUL0]], [[TMP6]] @@ -316,11 +316,11 @@ ; CHECK-NEXT: [[PTRY2:%.*]] = getelementptr inbounds float, float* [[PTRY]], i64 2 ; CHECK-NEXT: [[X0:%.*]] = load float, float* [[PTRX]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = load float, float* [[PTRY]], align 4 +; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[PTRX1]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[PTRY1]] to <2 x float>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[MUL0:%.*]] = fmul float [[X0]], [[Y0]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 ; CHECK-NEXT: [[DOT01:%.*]] = fadd fast float [[MUL0]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -10,11 +10,11 @@ ; CHECK-LABEL: @fn1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> poison, i64* [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* ; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 8 @@ -41,18 +41,18 @@ ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 3 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP5]], i32 [[TMP6]]) -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[C]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: ret void @@ -103,11 +103,11 @@ ; CHECK-LABEL: @externally_used_ptrs( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64*> poison, i64* [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64*> [[TMP1]], i64* [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x i64*> [[TMP2]], <2 x i64> ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x i64*> [[TMP3]] to <2 x i64> -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64*> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <2 x i64>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fabs.ll @@ -39,10 +39,10 @@ define void @fabs_4f64() #0 { ; SSE-LABEL: @fabs_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]]) ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -70,25 +70,25 @@ define void @fabs_8f64() #0 { ; SSE-LABEL: @fabs_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP7]]) ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fabs_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP2]]) -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP1]]) +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[TMP3]]) ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -150,10 +150,10 @@ define void @fabs_8f32() #0 { ; SSE-LABEL: @fabs_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]]) ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -193,25 +193,25 @@ define void @fabs_16f32() #0 { ; SSE-LABEL: @fabs_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP7]]) ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fabs_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP2]]) -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP1]]) +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fabs.v8f32(<8 x float> [[TMP3]]) ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fcopysign.ll @@ -46,12 +46,12 @@ define void @fcopysign_4f64() #0 { ; SSE-LABEL: @fcopysign_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) ; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -84,31 +84,31 @@ define void @fcopysign_8f64() #0 { ; SSE-LABEL: @fcopysign_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]]) -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]]) +; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]]) ; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fcopysign_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]]) -; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]]) +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.copysign.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]]) ; AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -184,12 +184,12 @@ define void @fcopysign_8f32() #0 { ; SSE-LABEL: @fcopysign_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) ; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -238,31 +238,31 @@ define void @fcopysign_16f32() #0 { ; SSE-LABEL: @fcopysign_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]]) -; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]]) +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.copysign.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]]) ; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fcopysign_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]]) -; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) +; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.copysign.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]]) ; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fma.ll b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fma.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fma.ll @@ -160,14 +160,14 @@ ; ; FMA256-LABEL: @fma_8f64( ; FMA256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 -; FMA256-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]]) -; FMA256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]]) -; FMA256-NEXT: store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]]) +; FMA256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP7:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 +; FMA256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fma.v4f64(<4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[TMP7]]) ; FMA256-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; FMA256-NEXT: ret void ; @@ -458,14 +458,14 @@ ; ; FMA256-LABEL: @fma_16f32( ; FMA256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 -; FMA256-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]]) -; FMA256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]]) -; FMA256-NEXT: store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]]) +; FMA256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 +; FMA256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fma.v8f32(<8 x float> [[TMP5]], <8 x float> [[TMP6]], <8 x float> [[TMP7]]) ; FMA256-NEXT: store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; FMA256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll @@ -44,12 +44,12 @@ define void @fmaxnum_4f64() #0 { ; SSE-LABEL: @fmaxnum_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) ; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -82,31 +82,31 @@ define void @fmaxnum_8f64() #0 { ; SSE-LABEL: @fmaxnum_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]]) -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]]) +; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.maxnum.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]]) ; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmaxnum_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]]) -; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]]) +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.maxnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]]) ; AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -182,12 +182,12 @@ define void @fmaxnum_8f32() #0 { ; SSE-LABEL: @fmaxnum_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) ; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -236,31 +236,31 @@ define void @fmaxnum_16f32() #0 { ; SSE-LABEL: @fmaxnum_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]]) -; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]]) +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]]) ; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmaxnum_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]]) -; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) +; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]]) ; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll @@ -44,12 +44,12 @@ define void @fminnum_4f64() #0 { ; SSE-LABEL: @fminnum_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) ; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -82,31 +82,31 @@ define void @fminnum_8f64() #0 { ; SSE-LABEL: @fminnum_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]]) -; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP10]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP11]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]]) +; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]]) +; SSE-NEXT: store <2 x double> [[TMP9]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.minnum.v2f64(<2 x double> [[TMP10]], <2 x double> [[TMP11]]) ; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fminnum_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]]) -; AVX256-NEXT: store <4 x double> [[TMP5]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]]) +; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.minnum.v4f64(<4 x double> [[TMP4]], <4 x double> [[TMP5]]) ; AVX256-NEXT: store <4 x double> [[TMP6]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -182,12 +182,12 @@ define void @fminnum_8f32() #0 { ; SSE-LABEL: @fminnum_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) ; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -236,31 +236,31 @@ define void @fminnum_16f32() #0 { ; SSE-LABEL: @fminnum_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]]) -; SSE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]]) -; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]]) -; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP10]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP11]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]]) +; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]]) +; SSE-NEXT: store <4 x float> [[TMP9]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP10]], <4 x float> [[TMP11]]) ; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fminnum_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]]) -; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]]) -; AVX256-NEXT: store <8 x float> [[TMP5]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]]) +; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[TMP4]], <8 x float> [[TMP5]]) ; AVX256-NEXT: store <8 x float> [[TMP6]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll @@ -49,14 +49,14 @@ define void @fmuladd_4f64() #0 { ; SSE-LABEL: @fmuladd_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x double> [[TMP5]]) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x double> [[TMP6]]) -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x double> [[TMP7]]) ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -94,37 +94,37 @@ define void @fmuladd_8f64() #0 { ; SSE-LABEL: @fmuladd_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcA64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcB64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 2) to <2 x double>*), align 4 ; SSE-NEXT: [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @srcC64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x double> [[TMP7]]) +; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <2 x double>*), align 4 ; SSE-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP12:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP5]], <2 x double> [[TMP9]]) -; SSE-NEXT: [[TMP14:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP6]], <2 x double> [[TMP10]]) -; SSE-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP3]], <2 x double> [[TMP7]], <2 x double> [[TMP11]]) -; SSE-NEXT: [[TMP16:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP8]], <2 x double> [[TMP12]]) -; SSE-NEXT: store <2 x double> [[TMP13]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP14]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP15]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]]) +; SSE-NEXT: store <2 x double> [[TMP12]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP16:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]]) ; SSE-NEXT: store <2 x double> [[TMP16]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmuladd_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcA64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP3]], <4 x double> [[TMP5]]) -; AVX256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP2]], <4 x double> [[TMP4]], <4 x double> [[TMP6]]) -; AVX256-NEXT: store <4 x double> [[TMP7]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcB64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @srcC64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP1]], <4 x double> [[TMP2]], <4 x double> [[TMP3]]) +; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcA64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcB64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP7:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @srcC64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.fmuladd.v4f64(<4 x double> [[TMP5]], <4 x double> [[TMP6]], <4 x double> [[TMP7]]) ; AVX256-NEXT: store <4 x double> [[TMP8]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -214,14 +214,14 @@ define void @fmuladd_8f32() #0 { ; SSE-LABEL: @fmuladd_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP3]], <4 x float> [[TMP5]]) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP4]], <4 x float> [[TMP6]]) -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]]) ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -279,37 +279,37 @@ define void @fmuladd_16f32() #0 { ; SSE-LABEL: @fmuladd_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcA32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcB32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: [[TMP6:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @srcC32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x float> [[TMP7]]) +; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <4 x float>*), align 4 ; SSE-NEXT: [[TMP11:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP12:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP1]], <4 x float> [[TMP5]], <4 x float> [[TMP9]]) -; SSE-NEXT: [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP2]], <4 x float> [[TMP6]], <4 x float> [[TMP10]]) -; SSE-NEXT: [[TMP15:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP3]], <4 x float> [[TMP7]], <4 x float> [[TMP11]]) -; SSE-NEXT: [[TMP16:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP8]], <4 x float> [[TMP12]]) -; SSE-NEXT: store <4 x float> [[TMP13]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP14]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP15]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]) +; SSE-NEXT: store <4 x float> [[TMP12]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP13:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP14:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP15:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP16:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]]) ; SSE-NEXT: store <4 x float> [[TMP16]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @fmuladd_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcA32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP4:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP3]], <8 x float> [[TMP5]]) -; AVX256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP2]], <8 x float> [[TMP4]], <8 x float> [[TMP6]]) -; AVX256-NEXT: store <8 x float> [[TMP7]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcB32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @srcC32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP2]], <8 x float> [[TMP3]]) +; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcA32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcB32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP7:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @srcC32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP5]], <8 x float> [[TMP6]], <8 x float> [[TMP7]]) ; AVX256-NEXT: store <8 x float> [[TMP8]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll @@ -82,10 +82,10 @@ ; ; AVX256DQ-LABEL: @fptosi_8f64_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -119,10 +119,10 @@ define void @fptosi_8f64_8i32() #0 { ; SSE-LABEL: @fptosi_8f64_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -315,10 +315,10 @@ ; ; AVX256DQ-LABEL: @fptosi_8f32_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -352,10 +352,10 @@ define void @fptosi_8f32_8i32() #0 { ; SSE-LABEL: @fptosi_8f32_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -82,10 +82,10 @@ ; ; AVX256DQ-LABEL: @fptosi_8f64_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -119,10 +119,10 @@ define void @fptosi_8f64_8i32() #0 { ; SSE-LABEL: @fptosi_8f64_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x double> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -315,10 +315,10 @@ ; ; AVX256DQ-LABEL: @fptosi_8f32_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -352,10 +352,10 @@ define void @fptosi_8f32_8i32() #0 { ; SSE-LABEL: @fptosi_8f32_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -82,10 +82,10 @@ ; ; AVX256DQ-LABEL: @fptoui_8f64_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -119,10 +119,10 @@ define void @fptoui_8f64_8i32() #0 { ; SSE-LABEL: @fptoui_8f64_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptoui <4 x double> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x double> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; @@ -315,10 +315,10 @@ ; ; AVX256DQ-LABEL: @fptoui_8f32_8i64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; AVX256DQ-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64> -; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i64> -; AVX256DQ-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i64> +; AVX256DQ-NEXT: store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([8 x i64]* @dst64 to <4 x i64>*), align 8 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; AVX256DQ-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i64> ; AVX256DQ-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @dst64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX256DQ-NEXT: ret void ; @@ -352,10 +352,10 @@ define void @fptoui_8f32_8i32() #0 { ; SSE-LABEL: @fptoui_8f32_8i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP2]] to <4 x i32> -; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32> +; SSE-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([16 x i32]* @dst32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = fptoui <4 x float> [[TMP3]] to <4 x i32> ; SSE-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fround.ll b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fround.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fround.ll @@ -74,10 +74,10 @@ ; ; SSE41-LABEL: @ceil_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -132,34 +132,34 @@ ; ; SSE41-LABEL: @ceil_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @ceil_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ceil_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -245,10 +245,10 @@ ; ; SSE41-LABEL: @floor_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -303,34 +303,34 @@ ; ; SSE41-LABEL: @floor_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @floor_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @floor_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -416,10 +416,10 @@ ; ; SSE41-LABEL: @nearbyint_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -474,34 +474,34 @@ ; ; SSE41-LABEL: @nearbyint_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @nearbyint_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @nearbyint_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -587,10 +587,10 @@ ; ; SSE41-LABEL: @rint_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -645,34 +645,34 @@ ; ; SSE41-LABEL: @rint_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @rint_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @rint_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.rint.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -758,10 +758,10 @@ ; ; SSE41-LABEL: @trunc_4f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) ; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; @@ -816,34 +816,34 @@ ; ; SSE41-LABEL: @trunc_8f64( ; SSE41-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 -; SSE41-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP4]]) -; SSE41-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE41-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP1]]) +; SSE41-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP3]]) +; SSE41-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP5]]) +; SSE41-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 8 +; SSE41-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP7]]) ; SSE41-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 8 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @trunc_8f64( ; AVX1-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX1-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) -; AVX1-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) +; AVX1-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX1-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP3]]) ; AVX1-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @trunc_8f64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP2]]) -; AVX2-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP1]]) +; AVX2-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.trunc.v4f64(<4 x double> [[TMP3]]) ; AVX2-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 8 ; AVX2-NEXT: ret void ; @@ -953,10 +953,10 @@ ; ; SSE41-LABEL: @ceil_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1047,34 +1047,34 @@ ; ; SSE41-LABEL: @ceil_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @ceil_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @ceil_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; @@ -1208,10 +1208,10 @@ ; ; SSE41-LABEL: @floor_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1302,34 +1302,34 @@ ; ; SSE41-LABEL: @floor_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @floor_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @floor_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; @@ -1463,10 +1463,10 @@ ; ; SSE41-LABEL: @nearbyint_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1557,34 +1557,34 @@ ; ; SSE41-LABEL: @nearbyint_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @nearbyint_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @nearbyint_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.nearbyint.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; @@ -1718,10 +1718,10 @@ ; ; SSE41-LABEL: @rint_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -1812,34 +1812,34 @@ ; ; SSE41-LABEL: @rint_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @rint_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @rint_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.rint.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; @@ -1973,10 +1973,10 @@ ; ; SSE41-LABEL: @trunc_8f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) ; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; @@ -2067,34 +2067,34 @@ ; ; SSE41-LABEL: @trunc_16f32( ; SSE41-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE41-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) -; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP2]]) -; SSE41-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) -; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP4]]) -; SSE41-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE41-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP1]]) +; SSE41-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP3]]) +; SSE41-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP5]]) +; SSE41-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE41-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP7]]) ; SSE41-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE41-NEXT: ret void ; ; AVX1-LABEL: @trunc_16f32( ; AVX1-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX1-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) -; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) -; AVX1-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) +; AVX1-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX1-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP3]]) ; AVX1-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX1-NEXT: ret void ; ; AVX2-LABEL: @trunc_16f32( ; AVX2-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) -; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP2]]) -; AVX2-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP1]]) +; AVX2-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX2-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.trunc.v8f32(<8 x float> [[TMP3]]) ; AVX2-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX2-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/funclet.ll @@ -13,14 +13,14 @@ ; CHECK: catch: ; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] [i8* null, i32 64, i8* null] ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[A]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B]] to <2 x double>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP6]]) [ "funclet"(token [[TMP1]]) ] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: catchret from [[TMP1]] to label [[TRY_CONT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/gep.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gep.ll @@ -14,12 +14,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[Y]], i64 0, i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, <2 x i32*> [[TMP5]], <2 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* [[X]], i64 0, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, <2 x i32*> [[TMP6]], <2 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>* -; CHECK-NEXT: store <2 x i32*> [[TMP6]], <2 x i32*>* [[TMP8]], align 8 +; CHECK-NEXT: store <2 x i32*> [[TMP7]], <2 x i32*>* [[TMP8]], align 8 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds { i32*, i32* }, { i32*, i32* }* %y, i64 0, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -101,11 +101,11 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 +; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 ; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; CHECK-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 -; CHECK-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; CHECK-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] @@ -117,11 +117,11 @@ ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float +; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 +; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16 ; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2 -; THRESHOLD-NEXT: [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float ; THRESHOLD-NEXT: [[TMP4:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) ; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]] ; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[CONV6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -110,20 +110,20 @@ ; SSE-NEXT: ret i32 [[TMP23]] ; ; AVX-LABEL: @maxi8_store_in( -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX-NEXT: store i32 0, i32* @var, align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; AVX-NEXT: ret i32 [[TMP3]] ; ; AVX2-LABEL: @maxi8_store_in( -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX2-NEXT: store i32 0, i32* @var, align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; AVX2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; AVX2-NEXT: ret i32 [[TMP3]] ; ; THRESH-LABEL: @maxi8_store_in( -; THRESH-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; THRESH-NEXT: store i32 0, i32* @var, align 8 +; THRESH-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 ; THRESH-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP2]]) ; THRESH-NEXT: ret i32 [[TMP3]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll @@ -782,10 +782,10 @@ ; STORE-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD34]] ; STORE-NEXT: [[ADD1135:%.*]] = or i64 [[MUL]], 2 ; STORE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1135]] -; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>* -; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; STORE-NEXT: [[ADD1736:%.*]] = or i64 [[MUL]], 3 ; STORE-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1736]] +; STORE-NEXT: [[TMP1:%.*]] = bitcast float* [[B]] to <4 x float>* +; STORE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; STORE-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to <4 x float>* ; STORE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; STORE-NEXT: [[TMP5:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP4]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll @@ -51,15 +51,16 @@ ; SSE-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 2 ; SSE-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 2 ; SSE-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 3 +; SSE-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3 +; SSE-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3 +; SSE-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3 +; SSE-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3 ; SSE-NEXT: [[TMP4:%.*]] = bitcast i8* [[C_ADDR_0352]] to <4 x i8>* ; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1 -; SSE-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 3 ; SSE-NEXT: [[TMP6:%.*]] = bitcast i8* [[D_ADDR_0353]] to <4 x i8>* ; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1 -; SSE-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 3 ; SSE-NEXT: [[TMP8:%.*]] = bitcast i8* [[A_ADDR_0355]] to <4 x i8>* ; SSE-NEXT: [[TMP9:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1 -; SSE-NEXT: [[ARRAYIDX40:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 3 ; SSE-NEXT: [[TMP10:%.*]] = bitcast i8* [[B_ADDR_0351]] to <4 x i8>* ; SSE-NEXT: [[TMP11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP10]], align 1 ; SSE-NEXT: [[TMP12:%.*]] = icmp ult <4 x i8> [[TMP5]], [[TMP7]] @@ -67,7 +68,6 @@ ; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[TMP13]] to <4 x i32> ; SSE-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], [[SHUFFLE]] ; SSE-NEXT: [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 3 ; SSE-NEXT: [[TMP17:%.*]] = bitcast i8* [[E_ADDR_0354]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP16]], <4 x i8>* [[TMP17]], align 1 ; SSE-NEXT: [[ARRAYIDX45:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 4 @@ -86,15 +86,16 @@ ; SSE-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 6 ; SSE-NEXT: [[ARRAYIDX80:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 6 ; SSE-NEXT: [[ARRAYIDX81:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 7 +; SSE-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7 +; SSE-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7 +; SSE-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7 +; SSE-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7 ; SSE-NEXT: [[TMP18:%.*]] = bitcast i8* [[ARRAYIDX45]] to <4 x i8>* ; SSE-NEXT: [[TMP19:%.*]] = load <4 x i8>, <4 x i8>* [[TMP18]], align 1 -; SSE-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 7 ; SSE-NEXT: [[TMP20:%.*]] = bitcast i8* [[ARRAYIDX47]] to <4 x i8>* ; SSE-NEXT: [[TMP21:%.*]] = load <4 x i8>, <4 x i8>* [[TMP20]], align 1 -; SSE-NEXT: [[ARRAYIDX85:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 7 ; SSE-NEXT: [[TMP22:%.*]] = bitcast i8* [[ARRAYIDX49]] to <4 x i8>* ; SSE-NEXT: [[TMP23:%.*]] = load <4 x i8>, <4 x i8>* [[TMP22]], align 1 -; SSE-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 7 ; SSE-NEXT: [[TMP24:%.*]] = bitcast i8* [[ARRAYIDX52]] to <4 x i8>* ; SSE-NEXT: [[TMP25:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1 ; SSE-NEXT: [[TMP26:%.*]] = icmp ult <4 x i8> [[TMP19]], [[TMP21]] @@ -102,7 +103,6 @@ ; SSE-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> ; SSE-NEXT: [[TMP29:%.*]] = mul <4 x i32> [[TMP28]], [[SHUFFLE1]] ; SSE-NEXT: [[TMP30:%.*]] = trunc <4 x i32> [[TMP29]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 7 ; SSE-NEXT: [[TMP31:%.*]] = bitcast i8* [[ARRAYIDX56]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP30]], <4 x i8>* [[TMP31]], align 1 ; SSE-NEXT: [[ARRAYIDX93:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 8 @@ -121,15 +121,16 @@ ; SSE-NEXT: [[ARRAYIDX124:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 10 ; SSE-NEXT: [[ARRAYIDX128:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 10 ; SSE-NEXT: [[ARRAYIDX129:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 11 +; SSE-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11 +; SSE-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11 +; SSE-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11 +; SSE-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11 ; SSE-NEXT: [[TMP32:%.*]] = bitcast i8* [[ARRAYIDX93]] to <4 x i8>* ; SSE-NEXT: [[TMP33:%.*]] = load <4 x i8>, <4 x i8>* [[TMP32]], align 1 -; SSE-NEXT: [[ARRAYIDX131:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 11 ; SSE-NEXT: [[TMP34:%.*]] = bitcast i8* [[ARRAYIDX95]] to <4 x i8>* ; SSE-NEXT: [[TMP35:%.*]] = load <4 x i8>, <4 x i8>* [[TMP34]], align 1 -; SSE-NEXT: [[ARRAYIDX133:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 11 ; SSE-NEXT: [[TMP36:%.*]] = bitcast i8* [[ARRAYIDX97]] to <4 x i8>* ; SSE-NEXT: [[TMP37:%.*]] = load <4 x i8>, <4 x i8>* [[TMP36]], align 1 -; SSE-NEXT: [[ARRAYIDX136:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 11 ; SSE-NEXT: [[TMP38:%.*]] = bitcast i8* [[ARRAYIDX100]] to <4 x i8>* ; SSE-NEXT: [[TMP39:%.*]] = load <4 x i8>, <4 x i8>* [[TMP38]], align 1 ; SSE-NEXT: [[TMP40:%.*]] = icmp ult <4 x i8> [[TMP33]], [[TMP35]] @@ -137,7 +138,6 @@ ; SSE-NEXT: [[TMP42:%.*]] = zext <4 x i8> [[TMP41]] to <4 x i32> ; SSE-NEXT: [[TMP43:%.*]] = mul <4 x i32> [[TMP42]], [[SHUFFLE2]] ; SSE-NEXT: [[TMP44:%.*]] = trunc <4 x i32> [[TMP43]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX140:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 11 ; SSE-NEXT: [[TMP45:%.*]] = bitcast i8* [[ARRAYIDX104]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP44]], <4 x i8>* [[TMP45]], align 1 ; SSE-NEXT: [[ARRAYIDX141:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 12 @@ -156,15 +156,16 @@ ; SSE-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14 ; SSE-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14 ; SSE-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15 +; SSE-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 +; SSE-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 +; SSE-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 +; SSE-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; SSE-NEXT: [[TMP46:%.*]] = bitcast i8* [[ARRAYIDX141]] to <4 x i8>* ; SSE-NEXT: [[TMP47:%.*]] = load <4 x i8>, <4 x i8>* [[TMP46]], align 1 -; SSE-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 ; SSE-NEXT: [[TMP48:%.*]] = bitcast i8* [[ARRAYIDX143]] to <4 x i8>* ; SSE-NEXT: [[TMP49:%.*]] = load <4 x i8>, <4 x i8>* [[TMP48]], align 1 -; SSE-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 ; SSE-NEXT: [[TMP50:%.*]] = bitcast i8* [[ARRAYIDX145]] to <4 x i8>* ; SSE-NEXT: [[TMP51:%.*]] = load <4 x i8>, <4 x i8>* [[TMP50]], align 1 -; SSE-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 ; SSE-NEXT: [[TMP52:%.*]] = bitcast i8* [[ARRAYIDX148]] to <4 x i8>* ; SSE-NEXT: [[TMP53:%.*]] = load <4 x i8>, <4 x i8>* [[TMP52]], align 1 ; SSE-NEXT: [[TMP54:%.*]] = icmp ult <4 x i8> [[TMP47]], [[TMP49]] @@ -172,7 +173,6 @@ ; SSE-NEXT: [[TMP56:%.*]] = zext <4 x i8> [[TMP55]] to <4 x i32> ; SSE-NEXT: [[TMP57:%.*]] = mul <4 x i32> [[TMP56]], [[SHUFFLE3]] ; SSE-NEXT: [[TMP58:%.*]] = trunc <4 x i32> [[TMP57]] to <4 x i8> -; SSE-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; SSE-NEXT: [[TMP59:%.*]] = bitcast i8* [[ARRAYIDX152]] to <4 x i8>* ; SSE-NEXT: store <4 x i8> [[TMP58]], <4 x i8>* [[TMP59]], align 1 ; SSE-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1 @@ -269,15 +269,16 @@ ; AVX512-NEXT: [[ARRAYIDX172:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 14 ; AVX512-NEXT: [[ARRAYIDX176:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 14 ; AVX512-NEXT: [[ARRAYIDX177:%.*]] = getelementptr inbounds i8, i8* [[C_ADDR_0352]], i64 15 +; AVX512-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 +; AVX512-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 +; AVX512-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 +; AVX512-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[C_ADDR_0352]] to <16 x i8>* ; AVX512-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 1 -; AVX512-NEXT: [[ARRAYIDX179:%.*]] = getelementptr inbounds i8, i8* [[D_ADDR_0353]], i64 15 ; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[D_ADDR_0353]] to <16 x i8>* ; AVX512-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1 -; AVX512-NEXT: [[ARRAYIDX181:%.*]] = getelementptr inbounds i8, i8* [[A_ADDR_0355]], i64 15 ; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[A_ADDR_0355]] to <16 x i8>* ; AVX512-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 1 -; AVX512-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, i8* [[B_ADDR_0351]], i64 15 ; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[B_ADDR_0351]] to <16 x i8>* ; AVX512-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1 ; AVX512-NEXT: [[TMP9:%.*]] = icmp ult <16 x i8> [[TMP2]], [[TMP4]] @@ -285,7 +286,6 @@ ; AVX512-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> ; AVX512-NEXT: [[TMP12:%.*]] = mul <16 x i32> [[TMP11]], [[SHUFFLE]] ; AVX512-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8> -; AVX512-NEXT: [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15 ; AVX512-NEXT: [[TMP14:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>* ; AVX512-NEXT: store <16 x i8> [[TMP13]], <16 x i8>* [[TMP14]], align 1 ; AVX512-NEXT: [[INC]] = add nuw nsw i32 [[I_0356]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector-inseltpoison.ll @@ -292,20 +292,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP18]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] @@ -451,14 +451,14 @@ ; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 ; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 ; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: ret <4 x float> [[TMP11]] +; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 +; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1 +; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 +; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1 +; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 +; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1 +; MINTREESIZE-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP17]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -327,20 +327,20 @@ ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[C0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[A0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[A1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[B0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[B1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP5]], <2 x float> [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <2 x i32> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[A2]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[B2]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[RD1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP18]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[RD1]] @@ -486,14 +486,14 @@ ; MINTREESIZE-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[A]], i32 0 ; MINTREESIZE-NEXT: [[TMP9:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 ; MINTREESIZE-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[TMP4]], i32 1 -; MINTREESIZE-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[A]], [[B]] -; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 -; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[TMP3]], i32 1 -; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 -; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[TMP2]], i32 1 -; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 -; MINTREESIZE-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP16]], float [[TMP1]], i32 1 -; MINTREESIZE-NEXT: ret <4 x float> [[TMP11]] +; MINTREESIZE-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP7]], i32 0 +; MINTREESIZE-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP3]], i32 1 +; MINTREESIZE-NEXT: [[TMP13:%.*]] = insertelement <2 x float> poison, float [[TMP6]], i32 0 +; MINTREESIZE-NEXT: [[TMP14:%.*]] = insertelement <2 x float> [[TMP13]], float [[TMP2]], i32 1 +; MINTREESIZE-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP5]], i32 0 +; MINTREESIZE-NEXT: [[TMP16:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP1]], i32 1 +; MINTREESIZE-NEXT: [[TMP17:%.*]] = fadd <4 x float> [[A]], [[B]] +; MINTREESIZE-NEXT: ret <4 x float> [[TMP17]] ; %a0 = extractelement <4 x float> %a, i32 0 %b0 = extractelement <4 x float> %b, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-shuffle.ll @@ -9,12 +9,12 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* undef, align 4 ; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_SW:%.*]], %struct.sw* [[V:%.*]], i64 0, i32 0 ; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_SW]], %struct.sw* [[V]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[X]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* undef, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* undef, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[X]] to <2 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 16 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[SHUFFLE]], [[SHUFFLE1]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[TMP6]], poison diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll @@ -7,14 +7,14 @@ ; CHECK-NEXT: [[PX0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[PY0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[PX1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP2]], i64 0, i64 1 +; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1 +; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0 +; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[PX0]] to <2 x double>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 4 -; CHECK-NEXT: [[PY1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP3]], i64 0, i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[PY0]] to <2 x double>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1:%.*]], i64 0, i64 0 -; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [2 x double], [2 x double]* [[TMP1]], i64 0, i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[PZ0]] to <2 x double>* ; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP8]], [[TMP10]] @@ -58,16 +58,16 @@ ; CHECK-NEXT: [[PX2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 2 ; CHECK-NEXT: [[PY2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 2 ; CHECK-NEXT: [[PX3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP2]], i64 0, i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[PY3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP3]], i64 0, i64 3 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>* -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[PZ0:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[PZ1:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 1 ; CHECK-NEXT: [[PZ2:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 2 ; CHECK-NEXT: [[PZ3:%.*]] = getelementptr inbounds [4 x float], [4 x float]* [[TMP1]], i64 0, i64 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[PX0]] to <4 x float>* +; CHECK-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[PY0]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[PZ0]] to <4 x float>* ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP8]], [[TMP10]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -8,9 +8,9 @@ ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 2 ; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 3 +; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 4 -; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/intrinsic_with_scalar_param.ll @@ -8,12 +8,12 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[A]], i32 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[A]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]]) ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 1 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, float* [[C]], i32 2 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, float* [[C]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> [[TMP1]], i32 [[P:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-shuffle-placement.ll @@ -22,16 +22,16 @@ ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void @@ -78,16 +78,16 @@ ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 13 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load.ll @@ -9,20 +9,20 @@ ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[SHUFFLE]] ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], <4 x i32>* [[TMP6]], align 4 @@ -67,6 +67,10 @@ ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 +; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 +; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 +; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 @@ -78,10 +82,6 @@ ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP9]], i32 3 ; CHECK-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP2]], [[TMP10]] -; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 -; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 -; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP12]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -17,36 +17,36 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 12 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 -; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 -; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 -; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 -; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 -; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14 ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 -; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 -; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @a, align 4 +; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP1]] to float +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> [[TMP5]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP6]] to <2 x float> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> , [[TMP8]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: store float [[TMP10]], float* @g, align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP11]], i32 2 +; CHECK-NEXT: store float [[TMP12]], float* @c, align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP11]], i32 0 +; CHECK-NEXT: store float [[TMP13]], float* @d, align 4 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP11]], i32 3 +; CHECK-NEXT: store float [[TMP14]], float* @e, align 4 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i32 1 +; CHECK-NEXT: store float [[TMP15]], float* @f, align 4 ; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 0 ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP11]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -148,17 +148,17 @@ ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2 -; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 ; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -148,17 +148,17 @@ ; CHECK-NEXT: [[Q3:%.*]] = getelementptr inbounds i64, i64* [[Q]], i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[P0]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 2 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[Q0]] to <2 x i64>* -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 2 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 2 -; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP4]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[P2]] to <2 x i64>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[Q2]] to <2 x i64>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[TMP8]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <2 x i64> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 ; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds i32, i32* [[R:%.*]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 ; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP10]], i32 0 ; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i32, i32* [[R]], i64 [[TMP13]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -237,6 +237,8 @@ ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 @@ -254,8 +256,6 @@ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 @@ -329,6 +329,8 @@ ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, double* [[B]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, double* [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[B0:%.*]] = load double, double* [[IDXB0]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, double* [[IDXC0]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, double* [[IDXD0]], align 8 @@ -346,8 +348,6 @@ ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[B1]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP9]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP10]], <2 x double>* [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 @@ -416,6 +416,8 @@ ; CHECK-LABEL: @lookahead_crash( ; CHECK-NEXT: [[IDXA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0 ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 +; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDXA0]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[C0:%.*]] = call double @_ZN1i2ayEv(%Class* [[ARG0:%.*]]) @@ -423,8 +425,6 @@ ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[C1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IDXS0:%.*]] = getelementptr inbounds double, double* [[S:%.*]], i64 0 -; CHECK-NEXT: [[IDXS1:%.*]] = getelementptr inbounds double, double* [[S]], i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[IDXS0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void @@ -457,6 +457,8 @@ ; CHECK-NEXT: [[LOADA1:%.*]] = load double, double* [[IDX1]], align 4 ; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 ; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 +; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[LOADA0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[LOADA0]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[LOADVEC]], [[TMP2]] @@ -464,8 +466,6 @@ ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[LOADA1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[LOADVEC2]], [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP3]], [[TMP6]] -; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 ; CHECK-NEXT: ret void @@ -591,8 +591,6 @@ ; CHECK-LABEL: @ChecksExtractScores_different_vectors( ; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds double, double* [[ARRAY:%.*]], i64 0 ; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[ARRAY]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[LOADVEC:%.*]] = load <2 x double>, <2 x double>* [[VECPTR1:%.*]], align 4 ; CHECK-NEXT: [[LOADVEC2:%.*]] = load <2 x double>, <2 x double>* [[VECPTR2:%.*]], align 4 ; CHECK-NEXT: [[EXTRA0:%.*]] = extractelement <2 x double> [[LOADVEC]], i32 0 @@ -601,6 +599,10 @@ ; CHECK-NEXT: [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4 ; CHECK-NEXT: [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0 ; CHECK-NEXT: [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1 +; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 +; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[IDX0]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]] @@ -609,8 +611,6 @@ ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]] ; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP8]] -; CHECK-NEXT: [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0 -; CHECK-NEXT: [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/metadata.ll @@ -34,12 +34,12 @@ define void @test2(double* %a, double* %b, i8* %e) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]], !fpmath !5 -; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll @@ -28,11 +28,11 @@ ; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 (...) @foo() ; CHECK-NEXT: br label [[TMP7]] ; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8 -; CHECK-NEXT: [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double> +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], i64 8 +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP10:%.*]] = fpext <2 x float> [[TMP9]] to <2 x double> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], -; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP9]] to <2 x double>* +; CHECK-NEXT: [[TMP12:%.*]] = bitcast double* [[TMP8]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8 ; CHECK-NEXT: ret i32 undef ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll @@ -9,16 +9,16 @@ define void @test(double* %i1, double* %i2, double* %o) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[I1_0:%.*]] = load double, double* [[I1:%.*]], align 16 -; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, double* [[I1]], i64 1 +; CHECK-NEXT: [[I1_GEP1:%.*]] = getelementptr double, double* [[I1:%.*]], i64 1 +; CHECK-NEXT: [[I1_0:%.*]] = load double, double* [[I1]], align 16 ; CHECK-NEXT: [[I1_1:%.*]] = load double, double* [[I1_GEP1]], align 16 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[I1_0]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[I1_1]], i32 1 ; CHECK-NEXT: br i1 undef, label [[THEN:%.*]], label [[END:%.*]] ; CHECK: then: ; CHECK-NEXT: [[I2_GEP0:%.*]] = getelementptr inbounds double, double* [[I2:%.*]], i64 0 -; CHECK-NEXT: [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16 ; CHECK-NEXT: [[I2_GEP1:%.*]] = getelementptr inbounds double, double* [[I2]], i64 1 +; CHECK-NEXT: [[I2_0:%.*]] = load double, double* [[I2_GEP0]], align 16 ; CHECK-NEXT: [[I2_1:%.*]] = load double, double* [[I2_GEP1]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[I2_0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[I2_1]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/powof2div.ll @@ -13,14 +13,14 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -101,14 +101,14 @@ ; AVX-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; AVX-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; AVX-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; AVX-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; AVX-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; AVX-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; AVX-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; AVX-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; AVX-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; AVX-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]], -; AVX-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; AVX-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; AVX-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; AVX-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/powof2mul.ll @@ -13,14 +13,14 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -68,14 +68,14 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -123,14 +123,14 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -178,14 +178,14 @@ ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP4]], -; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: ret void @@ -278,16 +278,16 @@ ; AVX-NEXT: [[GEP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 7 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i64* [[A]] to <4 x i64>* ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8 -; AVX-NEXT: [[TMP2:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>* -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 -; AVX-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[TMP1]], -; AVX-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], -; AVX-NEXT: [[TMP6:%.*]] = add <4 x i64> [[TMP4]], -; AVX-NEXT: [[TMP7:%.*]] = add <4 x i64> [[TMP5]], -; AVX-NEXT: [[TMP8:%.*]] = bitcast i64* [[A]] to <4 x i64>* -; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* [[TMP8]], align 8 +; AVX-NEXT: [[TMP2:%.*]] = mul <4 x i64> [[TMP1]], +; AVX-NEXT: [[TMP3:%.*]] = add <4 x i64> [[TMP2]], +; AVX-NEXT: [[TMP4:%.*]] = bitcast i64* [[A]] to <4 x i64>* +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* [[TMP4]], align 8 +; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>* +; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, <4 x i64>* [[TMP5]], align 8 +; AVX-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = add <4 x i64> [[TMP7]], ; AVX-NEXT: [[TMP9:%.*]] = bitcast i64* [[GEP4]] to <4 x i64>* -; AVX-NEXT: store <4 x i64> [[TMP7]], <4 x i64>* [[TMP9]], align 8 +; AVX-NEXT: store <4 x i64> [[TMP8]], <4 x i64>* [[TMP9]], align 8 ; AVX-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -35,14 +35,14 @@ ; AVX-NEXT: store i64 [[OR_1]], i64* undef, align 8 ; AVX-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; AVX-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 +; AVX-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; AVX-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 +; AVX-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 ; AVX-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* ; AVX-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; AVX-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 ; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[OR_1]], i32 0 ; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 ; AVX-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] -; AVX-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 -; AVX-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 ; AVX-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* ; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 ; AVX-NEXT: ret void @@ -72,22 +72,22 @@ ; SSE-NEXT: [[ADD:%.*]] = add i64 undef, undef ; SSE-NEXT: store i64 [[ADD]], i64* undef, align 1 ; SSE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 ; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; SSE-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; SSE-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; SSE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 ; SSE-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; SSE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 ; SSE-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 -; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], -; SSE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; SSE-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; SSE-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; SSE-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 +; SSE-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; SSE-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 +; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], +; SSE-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], ; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]] +; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; SSE-NEXT: ret void @@ -98,22 +98,22 @@ ; AVX-NEXT: [[ADD:%.*]] = add i64 undef, undef ; AVX-NEXT: store i64 [[ADD]], i64* undef, align 1 ; AVX-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5 +; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 +; AVX-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 +; AVX-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 ; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 1 ; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], -; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4 ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer -; AVX-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1 ; AVX-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], -; AVX-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], -; AVX-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0 -; AVX-NEXT: [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* -; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1 +; AVX-NEXT: [[TMP6:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>* +; AVX-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP6]], align 1 +; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i32 0 +; AVX-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> [[TMP7]], i64 [[ADD]], i32 1 +; AVX-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = and <2 x i64> [[TMP9]], ; AVX-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], -; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]] +; AVX-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] ; AVX-NEXT: [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>* ; AVX-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1 ; AVX-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -68,16 +68,16 @@ ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -86,16 +86,16 @@ ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -200,28 +200,28 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -229,28 +229,28 @@ ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -303,19 +303,19 @@ ; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 ; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 ; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], ; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 ; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 ; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 @@ -511,26 +511,26 @@ ; ; AVX512VL-LABEL: @gather_load_4( ; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> ; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 9 ; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 ; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* ; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] @@ -586,109 +586,109 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] -; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3 +; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] ; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2 ; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 ; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>* ; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* @@ -696,52 +696,52 @@ ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -68,16 +68,16 @@ ; ; AVX-LABEL: @gather_load_2( ; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -86,16 +86,16 @@ ; ; AVX2-LABEL: @gather_load_2( ; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1 -; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 -; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 -; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1 +; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2 ; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 ; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], ; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>* @@ -200,28 +200,28 @@ ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_3( -; AVX-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -229,28 +229,28 @@ ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_3( -; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 -; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 -; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 -; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP7]], i64 2 -; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP9]], i64 3 -; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP11]], i64 4 -; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP13]], i64 5 -; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP15]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0 +; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1 +; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2 +; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3 +; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4 +; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5 +; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6 ; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7 ; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>* @@ -303,19 +303,19 @@ ; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 ; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 ; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 +; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP7]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], ; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 ; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2 ; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 ; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3 @@ -511,26 +511,26 @@ ; ; AVX512VL-LABEL: @gather_load_4( ; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> ; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 +; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 9 ; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 ; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 ; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 ; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 ; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 ; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 ; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 ; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], ; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>* ; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]] @@ -586,109 +586,109 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( -; SSE-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; SSE-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; SSE-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; SSE-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[TMP22]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP26:%.*]] = fdiv <4 x float> [[TMP21]], [[TMP25]] -; SSE-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3 +; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] ; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>* -; SSE-NEXT: store <4 x float> [[TMP26]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; SSE-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; SSE-NEXT: [[TMP32:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; SSE-NEXT: [[TMP34:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; SSE-NEXT: [[TMP36:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP37]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP41]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP43:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP43]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP34]], i64 1 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP38]], i64 2 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP42]], i64 3 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP32]], i64 0 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP36]], i64 1 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP40]], i64 2 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2 ; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3 ; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]] -; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP27]] to <4 x float>* +; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>* ; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( -; AVX-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* @@ -696,52 +696,52 @@ ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( -; AVX2-NEXT: [[TMP3:%.*]] = load float, float* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 4 -; AVX2-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 -; AVX2-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 -; AVX2-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 -; AVX2-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 -; AVX2-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 -; AVX2-NEXT: [[TMP15:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 -; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 -; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 -; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 -; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 -; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 -; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 -; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6 ; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 ; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_horcost.ll @@ -20,22 +20,22 @@ ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], 5 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP1]], 2 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP1]], 6 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 ; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 3 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP1]], 7 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP1]], 7 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_diamond_match.ll @@ -7,33 +7,33 @@ ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* undef, i64 5 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* undef, i64 6 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* undef, i64 7 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>* -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i8> [[TMP6]] to <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]] -; CHECK-NEXT: [[TMP9:%.*]] = shl nsw <4 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP10]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP15]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP10]], i32 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <4 x i32> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP19]], <4 x i32> [[TMP20]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1 -; CHECK-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP21]] -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i32> [[TMP25]], <4 x i32> [[TMP26]], <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 3 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP27]], <4 x i32>* [[TMP29]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x [4 x i32]], [4 x [4 x i32]]* undef, i64 0, i64 1, i64 3 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = sub nsw <4 x i32> zeroinitializer, [[SHUFFLE]] +; CHECK-NEXT: [[TMP13:%.*]] = shl nsw <4 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP21]], i32 3 +; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw <4 x i32> [[TMP14]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP24]], <4 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP25]] +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP27]], <4 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 16 ; CHECK-NEXT: ret void ; %1 = getelementptr inbounds i8, i8* undef, i64 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -9,8 +9,6 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 undef, label [[IF_END50_I:%.*]], label [[IF_THEN22_I:%.*]] ; CHECK: if.then22.i: -; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 -; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX_I_I7_1_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX_I_I7_2_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 2 @@ -19,20 +17,23 @@ ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 +; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 +; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 +; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 +; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 +; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1 +; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0 ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[SHUFFLE1]], -; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 ; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 -; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 -; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[CONV31_I]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]], @@ -46,7 +47,6 @@ ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8> ; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]], -; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 ; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* ; CHECK-NEXT: store <16 x i8> [[TMP17]], <16 x i8>* [[TMP18]], align 1 ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/return.ll b/llvm/test/Transforms/SLPVectorizer/X86/return.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/return.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/return.ll @@ -44,9 +44,9 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i32 2 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[X]], i32 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[X]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[X]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX1]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll @@ -16,12 +16,12 @@ ; CHECK-NEXT: br label [[T37:%.*]] ; CHECK: t37: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ] -; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> , [[TMP6]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 0 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 1 ; CHECK-NEXT: [[T31:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 2 ; CHECK-NEXT: [[T33:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 3 +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> , [[TMP6]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[T21]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[SHUFFLE]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[T88:%.*]] = bitcast float* [[T9]] to <2 x float>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll @@ -53,11 +53,11 @@ ; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1 ; CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds float, float* [[C]], i64 2 ; CHECK-NEXT: [[C3:%.*]] = getelementptr inbounds float, float* [[C]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 1 ; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, float* [[D]], i64 2 ; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds float, float* [[D]], i64 3 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[D]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scheduling.ll @@ -19,22 +19,22 @@ ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = or i64 [[TMP1]], 5 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 ; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP1]], 2 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP1]], 6 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]] +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 ; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 3 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP1]], 7 -; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP1]], 7 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP8]] +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>* +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX2]] to <4 x i32>* ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP9]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 2 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP10]] ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX6]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 16 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll @@ -93,12 +93,12 @@ ; ; AVX2-LABEL: @ashr_v8i64( ; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX2-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] -; AVX2-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP2]] +; AVX2-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX2-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP4]], [[TMP5]] ; AVX2-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX2-NEXT: ret void ; @@ -111,12 +111,12 @@ ; ; XOP-LABEL: @ashr_v8i64( ; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP5:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = ashr <4 x i64> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP6:%.*]] = ashr <4 x i64> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; XOP-NEXT: ret void ; @@ -158,31 +158,31 @@ define void @ashr_v16i32() { ; SSE-LABEL: @ashr_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = ashr <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = ashr <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = ashr <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = ashr <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -195,12 +195,12 @@ ; ; XOP-LABEL: @ashr_v16i32( ; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP5:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = ashr <8 x i32> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP6:%.*]] = ashr <8 x i32> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; XOP-NEXT: ret void ; @@ -274,31 +274,31 @@ define void @ashr_v32i16() { ; SSE-LABEL: @ashr_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = ashr <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = ashr <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = ashr <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = ashr <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = ashr <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = ashr <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = ashr <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -311,12 +311,12 @@ ; ; XOP-LABEL: @ashr_v32i16( ; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP5:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = ashr <16 x i16> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP6:%.*]] = ashr <16 x i16> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; @@ -454,31 +454,31 @@ define void @ashr_v64i8() { ; SSE-LABEL: @ashr_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = ashr <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = ashr <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = ashr <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = ashr <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @ashr_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -491,12 +491,12 @@ ; ; XOP-LABEL: @ashr_v64i8( ; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP5:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = ashr <32 x i8> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP6:%.*]] = ashr <32 x i8> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll @@ -23,31 +23,31 @@ define void @lshr_v8i64() { ; SSE-LABEL: @lshr_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -60,12 +60,12 @@ ; ; XOP-LABEL: @lshr_v8i64( ; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; XOP-NEXT: ret void ; @@ -107,31 +107,31 @@ define void @lshr_v16i32() { ; SSE-LABEL: @lshr_v16i32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 -; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <4 x i32> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <4 x i32> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 -; SSE-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = lshr <4 x i32> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -144,12 +144,12 @@ ; ; XOP-LABEL: @lshr_v16i32( ; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = lshr <8 x i32> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; XOP-NEXT: ret void ; @@ -223,31 +223,31 @@ define void @lshr_v32i16() { ; SSE-LABEL: @lshr_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = lshr <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = lshr <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = lshr <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = lshr <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -260,12 +260,12 @@ ; ; XOP-LABEL: @lshr_v32i16( ; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP5:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = lshr <16 x i16> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP6:%.*]] = lshr <16 x i16> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; @@ -403,31 +403,31 @@ define void @lshr_v64i8() { ; SSE-LABEL: @lshr_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = lshr <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = lshr <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = lshr <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = lshr <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @lshr_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -440,12 +440,12 @@ ; ; XOP-LABEL: @lshr_v64i8( ; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP5:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = lshr <32 x i8> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP6:%.*]] = lshr <32 x i8> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll @@ -23,31 +23,31 @@ define void @shl_v8i64() { ; SSE-LABEL: @shl_v8i64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 -; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <2 x i64> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 -; SSE-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <2 x i64> [[TMP6]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP9:%.*]] = shl <2 x i64> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP11:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8 +; SSE-NEXT: [[TMP12:%.*]] = shl <2 x i64> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v8i64( ; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; AVX-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP3:%.*]] = shl <4 x i64> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; AVX-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; AVX-NEXT: ret void ; @@ -60,12 +60,12 @@ ; ; XOP-LABEL: @shl_v8i64( ; XOP-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 -; XOP-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <4 x i64> [[TMP5]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @b64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP3:%.*]] = shl <4 x i64> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <4 x i64> [[TMP3]], <4 x i64>* bitcast ([8 x i64]* @c64 to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP4:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP5:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <4 x i64>*), align 8 +; XOP-NEXT: [[TMP6:%.*]] = shl <4 x i64> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <4 x i64> [[TMP6]], <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <4 x i64>*), align 8 ; XOP-NEXT: ret void ; @@ -174,12 +174,12 @@ ; ; AVX-LABEL: @shl_v16i32( ; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = shl <8 x i32> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; AVX-NEXT: ret void ; @@ -192,12 +192,12 @@ ; ; XOP-LABEL: @shl_v16i32( ; XOP-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 -; XOP-NEXT: [[TMP5:%.*]] = shl <8 x i32> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <8 x i32> [[TMP5]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @b32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP3:%.*]] = shl <8 x i32> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP4:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP5:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <8 x i32>*), align 4 +; XOP-NEXT: [[TMP6:%.*]] = shl <8 x i32> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <8 x i32> [[TMP6]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 ; XOP-NEXT: ret void ; @@ -271,31 +271,31 @@ define void @shl_v32i16() { ; SSE-LABEL: @shl_v32i16( ; SSE-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 -; SSE-NEXT: [[TMP9:%.*]] = shl <8 x i16> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <8 x i16> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <8 x i16> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <8 x i16> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 -; SSE-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP3:%.*]] = shl <8 x i16> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP6:%.*]] = shl <8 x i16> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <8 x i16> [[TMP6]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP9:%.*]] = shl <8 x i16> [[TMP7]], [[TMP8]] +; SSE-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2 +; SSE-NEXT: [[TMP12:%.*]] = shl <8 x i16> [[TMP10]], [[TMP11]] ; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; AVX-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; AVX-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; AVX-NEXT: ret void ; @@ -308,12 +308,12 @@ ; ; XOP-LABEL: @shl_v32i16( ; XOP-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP3:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 -; XOP-NEXT: [[TMP5:%.*]] = shl <16 x i16> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <16 x i16> [[TMP5]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @b16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <16 x i16> [[TMP3]], <16 x i16>* bitcast ([32 x i16]* @c16 to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP4:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP5:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <16 x i16>*), align 2 +; XOP-NEXT: [[TMP6:%.*]] = shl <16 x i16> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <16 x i16> [[TMP6]], <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <16 x i16>*), align 2 ; XOP-NEXT: ret void ; @@ -451,31 +451,31 @@ define void @shl_v64i8() { ; SSE-LABEL: @shl_v64i8( ; SSE-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @a8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 -; SSE-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP1]], [[TMP5]] -; SSE-NEXT: [[TMP10:%.*]] = shl <16 x i8> [[TMP2]], [[TMP6]] -; SSE-NEXT: [[TMP11:%.*]] = shl <16 x i8> [[TMP3]], [[TMP7]] -; SSE-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP4]], [[TMP8]] -; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP10]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 -; SSE-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([64 x i8]* @b8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP3:%.*]] = shl <16 x i8> [[TMP1]], [[TMP2]] +; SSE-NEXT: store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([64 x i8]* @c8 to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP6:%.*]] = shl <16 x i8> [[TMP4]], [[TMP5]] +; SSE-NEXT: store <16 x i8> [[TMP6]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 16) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP7]], [[TMP8]] +; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 48) to <16 x i8>*), align 1 +; SSE-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP10]], [[TMP11]] +; SSE-NEXT: store <16 x i8> [[TMP9]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <16 x i8>*), align 1 ; SSE-NEXT: store <16 x i8> [[TMP12]], <16 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 48) to <16 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @shl_v64i8( ; AVX-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; AVX-NEXT: [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]] -; AVX-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]] -; AVX-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[TMP1]], [[TMP2]] +; AVX-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; AVX-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP4]], [[TMP5]] ; AVX-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; AVX-NEXT: ret void ; @@ -488,12 +488,12 @@ ; ; XOP-LABEL: @shl_v64i8( ; XOP-NEXT: [[TMP1:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @a8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP3:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 -; XOP-NEXT: [[TMP5:%.*]] = shl <32 x i8> [[TMP1]], [[TMP3]] -; XOP-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP2]], [[TMP4]] -; XOP-NEXT: store <32 x i8> [[TMP5]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP2:%.*]] = load <32 x i8>, <32 x i8>* bitcast ([64 x i8]* @b8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP3:%.*]] = shl <32 x i8> [[TMP1]], [[TMP2]] +; XOP-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* bitcast ([64 x i8]* @c8 to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP4:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @a8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP5:%.*]] = load <32 x i8>, <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @b8, i32 0, i64 32) to <32 x i8>*), align 1 +; XOP-NEXT: [[TMP6:%.*]] = shl <32 x i8> [[TMP4]], [[TMP5]] ; XOP-NEXT: store <32 x i8> [[TMP6]], <32 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @c8, i32 0, i64 32) to <32 x i8>*), align 1 ; XOP-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -6,19 +6,19 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5 +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 8 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[PTR1:%.*]], i32 3 ; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[SHRINK_SHUFFLE]], ; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 4 -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 5 ; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], poison ; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> poison, <4 x i32> [[SHUFFLE1]] ; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> poison, <4 x i32> zeroinitializer, <4 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, i32* [[PTR1]], i32 6 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP27]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll @@ -12,13 +12,13 @@ ; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[TMP10:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[I_019]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10]] = add i64 [[I_019]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP10]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simplebb.ll @@ -33,12 +33,12 @@ ; Simple 3-pair chain with loads and stores, obfuscated with bitcasts define void @test2(double* %a, double* %b, i8* %e) { ; CHECK-LABEL: @test2( +; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8 ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[C:%.*]] = bitcast i8* [[E:%.*]] to double* ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[C]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 ; CHECK-NEXT: ret void @@ -62,11 +62,11 @@ ; Don't vectorize volatile loads. define void @test_volatile_load(double* %a, double* %b, double* %c) { ; CHECK-LABEL: @test_volatile_load( -; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A:%.*]], align 8 -; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B:%.*]], align 8 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[I0:%.*]] = load volatile double, double* [[A]], align 8 +; CHECK-NEXT: [[I1:%.*]] = load volatile double, double* [[B]], align 8 ; CHECK-NEXT: [[I3:%.*]] = load double, double* [[ARRAYIDX3]], align 8 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: [[I4:%.*]] = load double, double* [[ARRAYIDX4]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[I0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[I3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -181,10 +181,10 @@ ; ; AVX256DQ-LABEL: @sitofp_8i64_8f64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x double> ; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256DQ-NEXT: ret void ; @@ -234,10 +234,10 @@ define void @sitofp_4i32_4f64() #0 { ; SSE-LABEL: @sitofp_4i32_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -265,25 +265,25 @@ define void @sitofp_8i32_8f64() #0 { ; SSE-LABEL: @sitofp_8i32_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -339,10 +339,10 @@ define void @sitofp_4i16_4f64() #0 { ; SSE-LABEL: @sitofp_4i16_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -370,25 +370,25 @@ define void @sitofp_8i16_8f64() #0 { ; SSE-LABEL: @sitofp_8i16_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -444,10 +444,10 @@ define void @sitofp_4i8_4f64() #0 { ; SSE-LABEL: @sitofp_4i8_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -475,25 +475,25 @@ define void @sitofp_8i8_8f64() #0 { ; SSE-LABEL: @sitofp_8i8_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -578,10 +578,10 @@ define void @sitofp_8i64_8f32() #0 { ; SSE-LABEL: @sitofp_8i64_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -643,10 +643,10 @@ define void @sitofp_8i32_8f32() #0 { ; SSE-LABEL: @sitofp_8i32_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -686,25 +686,25 @@ define void @sitofp_16i32_16f32() #0 { ; SSE-LABEL: @sitofp_16i32_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -790,10 +790,10 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -833,25 +833,25 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -937,10 +937,10 @@ define void @sitofp_8i8_8f32() #0 { ; SSE-LABEL: @sitofp_8i8_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -980,25 +980,25 @@ define void @sitofp_16i8_16f32() #0 { ; SSE-LABEL: @sitofp_16i8_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -181,10 +181,10 @@ ; ; AVX256DQ-LABEL: @sitofp_8i64_8f64( ; AVX256DQ-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256DQ-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256DQ-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256DQ-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256DQ-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256DQ-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256DQ-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x double> ; AVX256DQ-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256DQ-NEXT: ret void ; @@ -234,10 +234,10 @@ define void @sitofp_4i32_4f64() #0 { ; SSE-LABEL: @sitofp_4i32_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -265,25 +265,25 @@ define void @sitofp_8i32_8f64() #0 { ; SSE-LABEL: @sitofp_8i32_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i32> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -339,10 +339,10 @@ define void @sitofp_4i16_4f64() #0 { ; SSE-LABEL: @sitofp_4i16_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -370,25 +370,25 @@ define void @sitofp_8i16_8f64() #0 { ; SSE-LABEL: @sitofp_8i16_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i16> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i16> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -444,10 +444,10 @@ define void @sitofp_4i8_4f64() #0 { ; SSE-LABEL: @sitofp_4i8_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -475,25 +475,25 @@ define void @sitofp_8i8_8f64() #0 { ; SSE-LABEL: @sitofp_8i8_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i8> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <2 x i8> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -578,10 +578,10 @@ define void @sitofp_8i64_8f32() #0 { ; SSE-LABEL: @sitofp_8i64_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i64> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -643,10 +643,10 @@ define void @sitofp_8i32_8f32() #0 { ; SSE-LABEL: @sitofp_8i32_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -686,25 +686,25 @@ define void @sitofp_16i32_16f32() #0 { ; SSE-LABEL: @sitofp_16i32_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i32> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i32> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i32> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -790,10 +790,10 @@ define void @sitofp_8i16_8f32() #0 { ; SSE-LABEL: @sitofp_8i16_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -833,25 +833,25 @@ define void @sitofp_16i16_16f32() #0 { ; SSE-LABEL: @sitofp_16i16_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i16> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i16> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i16> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i16> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i16> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -937,10 +937,10 @@ define void @sitofp_8i8_8f32() #0 { ; SSE-LABEL: @sitofp_8i8_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -980,25 +980,25 @@ define void @sitofp_16i8_16f32() #0 { ; SSE-LABEL: @sitofp_16i8_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = sitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = sitofp <4 x i8> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = sitofp <4 x i8> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = sitofp <4 x i8> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = sitofp <8 x i8> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = sitofp <8 x i8> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -7,27 +7,28 @@ ; CHECK-LABEL: @_Z4testP1S( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 1, i64 0 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 15 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 1 -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 7 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 2 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 6 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 3 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 4 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 12 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 5 -; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 13 ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 6 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 1, i64 7 +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 15 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 7 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 6 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 4 +; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 12 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 13 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 14 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 2, i64 5 @@ -41,7 +42,6 @@ ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32*> [[TMP8]], i32* [[ARRAYIDX48]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> , <8 x i32> undef) ; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[ARRAYIDX2]] to <8 x i32>* ; CHECK-NEXT: store <8 x i32> [[TMP11]], <8 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret void @@ -128,15 +128,15 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 0, i64 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G20]] to <4 x i32>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> poison, <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> @@ -199,21 +199,21 @@ ; CHECK-NEXT: [[G22:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 14 ; CHECK-NEXT: [[G23:%.*]] = getelementptr inbounds [16 x i32], [16 x i32]* [[P4]], i32 0, i64 15 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P:%.*]], i64 0, i32 0, i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <2 x i32>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>* -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 3 ; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[G10]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G12]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[G20]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[G22]] to <2 x i32>* ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 0, i32 0, i64 7 ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sqrt.ll @@ -37,10 +37,10 @@ define void @sqrt_4f64() #0 { ; SSE-LABEL: @sqrt_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 8 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 8 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]]) ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 8 ; SSE-NEXT: ret void ; @@ -68,25 +68,25 @@ define void @sqrt_8f64() #0 { ; SSE-LABEL: @sqrt_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast ([8 x double]* @src64 to <2 x double>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP4]]) -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP3]]) +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP5]]) +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6) to <2 x double>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP7]]) ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sqrt_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x double>, <4 x double>* bitcast ([8 x double]* @src64 to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP2]]) -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP1]]) +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4) to <4 x double>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[TMP3]]) ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 4 ; AVX256-NEXT: ret void ; @@ -148,10 +148,10 @@ define void @sqrt_8f32() #0 { ; SSE-LABEL: @sqrt_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]]) ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; @@ -191,25 +191,25 @@ define void @sqrt_16f32() #0 { ; SSE-LABEL: @sqrt_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([16 x float]* @src32 to <4 x float>*), align 4 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) -; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP2]]) -; SSE-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]]) -; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP4]]) -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP3]]) +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP5]]) +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x float>, <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 12) to <4 x float>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP7]]) ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 4 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @sqrt_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]]) -; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP2]]) -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP1]]) +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x float>, <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 8) to <8 x float>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.sqrt.v8f32(<8 x float> [[TMP3]]) ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 4 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/store-jumbled.ll @@ -9,19 +9,19 @@ ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0 ; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[SHUFFLE]], <4 x i32>* [[TMP6]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll @@ -4,26 +4,26 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) { ; CHECK-LABEL: @non-ordered-stores( ; CHECK-NEXT: [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0 -; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 ; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 ; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 ; CHECK-NEXT: [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0 -; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 ; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1 -; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 ; CHECK-NEXT: [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2 -; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 -; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 +; CHECK-NEXT: [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4 +; CHECK-NEXT: [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4 +; CHECK-NEXT: [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4 +; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4 +; CHECK-NEXT: [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4 +; CHECK-NEXT: [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4 +; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores_vectorize.ll @@ -90,13 +90,13 @@ ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 10 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5 ; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[P3]] to <4 x i64>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 11 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[ARRAYIDX1]] to <4 x i64>* ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = shl <4 x i64> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 4 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[ARRAYIDX14]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP5]], align 8 @@ -148,9 +148,6 @@ ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 @@ -167,6 +164,9 @@ ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: ret void @@ -230,9 +230,6 @@ ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 1 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 2 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i64, i64* [[P3]], i64 5 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 @@ -250,6 +247,9 @@ ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 ; CHECK-NEXT: store i64 5, i64* [[ARRAYIDX9]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[P3]] to <4 x i64>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i64> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[P3]] to <4 x i64>* ; CHECK-NEXT: store <4 x i64> [[TMP4]], <4 x i64>* [[TMP5]], align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -11,9 +11,9 @@ ; CHECK-NEXT: [[DST_ADDR_014:%.*]] = phi double* [ [[ADD_PTR4:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_013:%.*]] = phi double* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[SRC_ADDR_013]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST_ADDR_014]], i64 1 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[DST_ADDR_014]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[SRC_ADDR_013]], i64 [[I_015]] @@ -62,9 +62,9 @@ ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC_ADDR_021]] to <4 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[DST_ADDR_022]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] @@ -165,16 +165,16 @@ ; CHECK-NEXT: [[I_023:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[DST_ADDR_022:%.*]] = phi float* [ [[ADD_PTR8:%.*]], [[FOR_BODY]] ], [ [[DST:%.*]], [[ENTRY]] ] ; CHECK-NEXT: [[SRC_ADDR_021:%.*]] = phi float* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[SRC:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 4 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC_ADDR_021]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -39,10 +39,10 @@ define void @uitofp_4i64_4f64() #0 { ; SSE-LABEL: @uitofp_4i64_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -70,25 +70,25 @@ define void @uitofp_8i64_8f64() #0 { ; SSE-LABEL: @uitofp_8i64_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i64> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i64> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 2) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i64> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <2 x i64>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i64> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 6) to <2 x i64>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i64> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i64_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -171,10 +171,10 @@ define void @uitofp_4i32_4f64() #0 { ; SSE-LABEL: @uitofp_4i32_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -202,25 +202,25 @@ define void @uitofp_8i32_8f64() #0 { ; SSE-LABEL: @uitofp_8i32_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* bitcast ([16 x i32]* @src32 to <2 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i32> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 2) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i32> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <2 x i32>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i32> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 6) to <2 x i32>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i32> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i32_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -276,10 +276,10 @@ define void @uitofp_4i16_4f64() #0 { ; SSE-LABEL: @uitofp_4i16_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -307,25 +307,25 @@ define void @uitofp_8i16_8f64() #0 { ; SSE-LABEL: @uitofp_8i16_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i16>, <2 x i16>* bitcast ([32 x i16]* @src16 to <2 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i16> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i16> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 2) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <2 x i16>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i16> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i16>, <2 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 6) to <2 x i16>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i16> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i16_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -381,10 +381,10 @@ define void @uitofp_4i8_4f64() #0 { ; SSE-LABEL: @uitofp_4i8_4f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP3]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; @@ -412,25 +412,25 @@ define void @uitofp_8i8_8f64() #0 { ; SSE-LABEL: @uitofp_8i8_8f64( ; SSE-NEXT: [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* bitcast ([64 x i8]* @src8 to <2 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 -; SSE-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i8> [[TMP4]] to <2 x double> -; SSE-NEXT: store <2 x double> [[TMP5]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 -; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 -; SSE-NEXT: store <2 x double> [[TMP7]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <2 x i8> [[TMP1]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP2]], <2 x double>* bitcast ([8 x double]* @dst64 to <2 x double>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 2) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <2 x i8> [[TMP3]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP4]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 2) to <2 x double>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <2 x i8>*), align 4 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <2 x i8> [[TMP5]] to <2 x double> +; SSE-NEXT: store <2 x double> [[TMP6]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <2 x double>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 6) to <2 x i8>*), align 2 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <2 x i8> [[TMP7]] to <2 x double> ; SSE-NEXT: store <2 x double> [[TMP8]], <2 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 6) to <2 x double>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_8i8_8f64( ; AVX256-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x double> -; AVX256-NEXT: store <4 x double> [[TMP3]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x double> +; AVX256-NEXT: store <4 x double> [[TMP2]], <4 x double>* bitcast ([8 x double]* @dst64 to <4 x double>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x double> ; AVX256-NEXT: store <4 x double> [[TMP4]], <4 x double>* bitcast (double* getelementptr inbounds ([8 x double], [8 x double]* @dst64, i32 0, i64 4) to <4 x double>*), align 32 ; AVX256-NEXT: ret void ; @@ -515,10 +515,10 @@ define void @uitofp_8i64_8f32() #0 { ; SSE-LABEL: @uitofp_8i64_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @src64 to <4 x i64>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i64> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 4) to <4 x i64>*), align 32 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i64> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -580,10 +580,10 @@ define void @uitofp_8i32_8f32() #0 { ; SSE-LABEL: @uitofp_8i32_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -623,25 +623,25 @@ define void @uitofp_16i32_16f32() #0 { ; SSE-LABEL: @uitofp_16i32_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @src32 to <4 x i32>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i32> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i32> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i32> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <4 x i32>*), align 32 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i32> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 12) to <4 x i32>*), align 16 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i32> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i32_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @src32 to <8 x i32>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i32> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <8 x i32> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @src32, i32 0, i64 8) to <8 x i32>*), align 32 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i32> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -727,10 +727,10 @@ define void @uitofp_8i16_8f32() #0 { ; SSE-LABEL: @uitofp_8i16_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -770,25 +770,25 @@ define void @uitofp_16i16_16f32() #0 { ; SSE-LABEL: @uitofp_16i16_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* bitcast ([32 x i16]* @src16 to <4 x i16>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i16> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 4) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i16> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <4 x i16>*), align 16 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i16> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 12) to <4 x i16>*), align 8 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i16> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i16_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @src16 to <8 x i16>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <8 x i16> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @src16, i32 0, i64 8) to <8 x i16>*), align 16 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i16> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; @@ -874,10 +874,10 @@ define void @uitofp_8i8_8f32() #0 { ; SSE-LABEL: @uitofp_8i8_8f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP3]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; @@ -917,25 +917,25 @@ define void @uitofp_16i8_16f32() #0 { ; SSE-LABEL: @uitofp_16i8_16f32( ; SSE-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* bitcast ([64 x i8]* @src8 to <4 x i8>*), align 64 -; SSE-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 -; SSE-NEXT: [[TMP5:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> -; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i8> [[TMP2]] to <4 x float> -; SSE-NEXT: [[TMP7:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float> -; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i8> [[TMP4]] to <4 x float> -; SSE-NEXT: store <4 x float> [[TMP5]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 -; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 -; SSE-NEXT: store <4 x float> [[TMP7]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP2:%.*]] = uitofp <4 x i8> [[TMP1]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP2]], <4 x float>* bitcast ([16 x float]* @dst32 to <4 x float>*), align 64 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 4) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = uitofp <4 x i8> [[TMP3]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP4]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 4) to <4 x float>*), align 16 +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <4 x i8>*), align 8 +; SSE-NEXT: [[TMP6:%.*]] = uitofp <4 x i8> [[TMP5]] to <4 x float> +; SSE-NEXT: store <4 x float> [[TMP6]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <4 x float>*), align 32 +; SSE-NEXT: [[TMP7:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 12) to <4 x i8>*), align 4 +; SSE-NEXT: [[TMP8:%.*]] = uitofp <4 x i8> [[TMP7]] to <4 x float> ; SSE-NEXT: store <4 x float> [[TMP8]], <4 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 12) to <4 x float>*), align 16 ; SSE-NEXT: ret void ; ; AVX256-LABEL: @uitofp_16i8_16f32( ; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* bitcast ([64 x i8]* @src8 to <8 x i8>*), align 64 -; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 -; AVX256-NEXT: [[TMP3:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float> -; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i8> [[TMP2]] to <8 x float> -; AVX256-NEXT: store <8 x float> [[TMP3]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP2:%.*]] = uitofp <8 x i8> [[TMP1]] to <8 x float> +; AVX256-NEXT: store <8 x float> [[TMP2]], <8 x float>* bitcast ([16 x float]* @dst32 to <8 x float>*), align 64 +; AVX256-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([64 x i8], [64 x i8]* @src8, i32 0, i64 8) to <8 x i8>*), align 8 +; AVX256-NEXT: [[TMP4:%.*]] = uitofp <8 x i8> [[TMP3]] to <8 x float> ; AVX256-NEXT: store <8 x float> [[TMP4]], <8 x float>* bitcast (float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 8) to <8 x float>*), align 32 ; AVX256-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-alt-shuffle.ll @@ -8,19 +8,19 @@ ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 1 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 2 ; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i8, i8* [[C]], i64 3 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 -1 +; CHECK-NEXT: [[ADD_PTR37:%.*]] = getelementptr inbounds float, float* [[D]], i64 -2 +; CHECK-NEXT: [[ADD_PTR45:%.*]] = getelementptr inbounds float, float* [[D]], i64 -3 +; CHECK-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, float* [[D]], i64 -4 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[ARRAYIDX4]] to <4 x i8>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw <4 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 -1 -; CHECK-NEXT: [[ADD_PTR37:%.*]] = getelementptr inbounds float, float* [[D]], i64 -2 -; CHECK-NEXT: [[ADD_PTR45:%.*]] = getelementptr inbounds float, float* [[D]], i64 -3 ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> poison, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = sitofp <4 x i32> [[TMP6]] to <4 x float> ; CHECK-NEXT: [[TMP8:%.*]] = fdiv <4 x float> [[TMP7]], poison -; CHECK-NEXT: [[ADD_PTR53:%.*]] = getelementptr inbounds float, float* [[D]], i64 -4 ; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ADD_PTR53]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP9]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reordered-list.ll @@ -6,10 +6,10 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[ISEC:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX10]] to <2 x double>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 3 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[ISEC]], i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX10]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX2]] to <2 x double>* ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -17,18 +17,18 @@ ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double -; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP8]], <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP7]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> poison, double [[TMP10]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP8]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP12]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = fcmp ogt <4 x double> [[TMP13]], [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = fptrunc <4 x double> [[TMP15]] to <4 x float> ; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP2]], <4 x float> [[TMP16]] ; CHECK-NEXT: br label [[BB3]] diff --git a/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll --- a/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll +++ b/llvm/test/Transforms/SLPVectorizer/int_sideeffect.ll @@ -12,9 +12,9 @@ ; CHECK-NEXT: [[P2:%.*]] = getelementptr float, float* [[P]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr float, float* [[P]], i64 3 ; CHECK-NEXT: call void @llvm.sideeffect() +; CHECK-NEXT: call void @llvm.sideeffect() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: call void @llvm.sideeffect() ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void @@ -45,9 +45,9 @@ ; CHECK-NEXT: [[P2:%.*]] = getelementptr float, float* [[P]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr float, float* [[P]], i64 3 ; CHECK-NEXT: call void @foo() #[[ATTR1:[0-9]+]] +; CHECK-NEXT: call void @foo() #[[ATTR1]] ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: call void @foo() #[[ATTR1]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[P0]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void