Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -706,14 +706,9 @@ // If this is a zext/sext of a load, return 0 if the corresponding // extending load exists on target. - if (I && isa(I->getOperand(0))) { - EVT ExtVT = EVT::getEVT(Dst); - EVT LoadVT = EVT::getEVT(Src); - unsigned LType = - ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); - if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) + if (auto LI = dyn_cast(I->getOperand(0))) + if (getTLI()->isExtLoad(LI, I, DL)) return 0; - } break; case Instruction::AddrSpaceCast: if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -10,7 +10,7 @@ ; CHECK-PWR8: Setting best plan to VF=16, UF=4 -; CHECK-PWR9: Setting best plan to VF=8, UF=8 +; CHECK-PWR9: Setting best plan to VF=16, UF=4 entry: Index: llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll +++ llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks-loopid-dbg.ll @@ -6,7 +6,7 @@ ; DEBUG-OUTPUT-NOT: .loc ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info -; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1) +; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 2, interleaved count: 1) ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4) ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized Index: llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll +++ llvm/test/Transforms/LoopVectorize/X86/vectorization-remarks.ll @@ -6,7 +6,7 @@ ; DEBUG-OUTPUT-NOT: .loc ; DEBUG-OUTPUT-NOT: {{.*}}.debug_info -; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 4, interleaved count: 1) +; VECTORIZED: remark: vectorization-remarks.c:17:8: vectorized loop (vectorization width: 2, interleaved count: 1) ; UNROLLED: remark: vectorization-remarks.c:17:8: interleaved loop (interleaved count: 4) ; NONE: remark: vectorization-remarks.c:17:8: loop not vectorized: vectorization and interleaving are explicitly disabled, or the loop has already been vectorized Index: llvm/test/Transforms/SLPVectorizer/X86/sext.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/X86/sext.ll +++ llvm/test/Transforms/SLPVectorizer/X86/sext.ll @@ -11,15 +11,26 @@ ; define <2 x i64> @loadext_2i8_to_2i64(i8* %p0) { -; SSE-LABEL: @loadext_2i8_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i8_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i8_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i8_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -114,23 +125,42 @@ } define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) { -; SSE-LABEL: @loadext_4i8_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i8_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] +; +; SLM-LABEL: @loadext_4i8_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 +; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i8_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -665,15 +695,26 @@ ; define <2 x i64> @loadext_2i16_to_2i64(i16* %p0) { -; SSE-LABEL: @loadext_2i16_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i16_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i16_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i16_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -768,23 +809,42 @@ } define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) { -; SSE-LABEL: @loadext_4i16_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i16_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] +; +; SLM-LABEL: @loadext_4i16_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 +; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -953,15 +1013,26 @@ ; define <2 x i64> @loadext_2i32_to_2i64(i32* %p0) { -; SSE-LABEL: @loadext_2i32_to_2i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: ret <2 x i64> [[V1]] +; SSE2-LABEL: @loadext_2i32_to_2i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[TMP4]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: ret <2 x i64> [[V1]] +; +; SLM-LABEL: @loadext_2i32_to_2i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <2 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <2 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: ret <2 x i64> [[V1]] ; ; AVX-LABEL: @loadext_2i32_to_2i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 @@ -985,23 +1056,42 @@ } define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { -; SSE-LABEL: @loadext_4i32_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i32_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SSE2-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SSE2-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; SSE2-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; SSE2-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 +; SSE2-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] +; +; SLM-LABEL: @loadext_4i32_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SLM-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SLM-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SLM-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; SLM-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i32_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1