diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -117,6 +117,7 @@ unsigned getNumberOfRegisters(unsigned ClassID) const; TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; + unsigned getMinVectorRegisterBitWidth() const; unsigned getLoadStoreVecRegBitWidth(unsigned AS) const; unsigned getMaxInterleaveFactor(unsigned VF); InstructionCost getArithmeticInstrCost( diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -150,6 +150,8 @@ llvm_unreachable("Unsupported register kind"); } +unsigned X86TTIImpl::getMinVectorRegisterBitWidth() const { return 64; } + unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned) const { return getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedSize(); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -348,22 +348,18 @@ define void @no_vec_shuff_reorder() #0 { ; CHECK-LABEL: @no_vec_shuff_reorder( -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = fadd float [[TMP1]], [[TMP2]] -; CHECK-NEXT: store float [[TMP3]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP6:%.*]] = fsub float [[TMP4]], [[TMP5]] -; CHECK-NEXT: store float [[TMP6]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 1), align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = fadd float [[TMP7]], [[TMP8]] -; CHECK-NEXT: store float [[TMP9]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 3), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fsub float [[TMP10]], [[TMP11]] -; CHECK-NEXT: store float [[TMP12]], float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 3), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fa to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([4 x float]* @fb to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP4]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP5]], <2 x float>* bitcast ([4 x float]* @fc to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 2) to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fa, i32 0, i64 2) to <2 x float>*), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fsub <2 x float> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP10]], <2 x float>* bitcast (float* getelementptr inbounds ([4 x float], [4 x float]* @fc, i32 0, i64 2) to <2 x float>*), align 4 ; CHECK-NEXT: ret void ; %1 = load float, float* getelementptr inbounds ([4 x float], [4 x float]* @fb, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll @@ -20,13 +20,13 @@ ; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; SSE-NEXT: [[R21:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R21]], float [[AB3]], i32 3 ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> ; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R71]] +; SSE-NEXT: [[R73:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R73]] ; ; SLM-LABEL: @ceil_floor( ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 @@ -41,34 +41,34 @@ ; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; SLM-NEXT: [[R21:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R21]], float [[AB3]], i32 3 ; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> ; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R71]] +; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[R73]] ; ; AVX-LABEL: @ceil_floor( ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) -; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]]) ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> ; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> poison, float [[AB0]], i32 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; AVX-NEXT: [[R21:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> ; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[R42:%.*]] = shufflevector <8 x float> [[R21]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R42]], float [[AB5]], i32 5 ; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[R71]] +; AVX-NEXT: [[R73:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP9]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[R73]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll @@ -20,13 +20,13 @@ ; SSE-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; SSE-NEXT: [[R21:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R21]], float [[AB3]], i32 3 ; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> ; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> -; SSE-NEXT: ret <8 x float> [[R71]] +; SSE-NEXT: [[R73:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SSE-NEXT: ret <8 x float> [[R73]] ; ; SLM-LABEL: @ceil_floor( ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 @@ -41,34 +41,34 @@ ; SLM-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 ; SLM-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; SLM-NEXT: [[R21:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R21]], float [[AB3]], i32 3 ; SLM-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> ; SLM-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> ; SLM-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> -; SLM-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> -; SLM-NEXT: ret <8 x float> [[R71]] +; SLM-NEXT: [[R73:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> +; SLM-NEXT: ret <8 x float> [[R73]] ; ; AVX-LABEL: @ceil_floor( ; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; AVX-NEXT: [[AB0:%.*]] = call float @llvm.ceil.f32(float [[A0]]) ; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> ; AVX-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP1]]) -; AVX-NEXT: [[AB3:%.*]] = call float @llvm.ceil.f32(float [[A3]]) -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> ; AVX-NEXT: [[TMP4:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[TMP3]]) +; AVX-NEXT: [[AB5:%.*]] = call float @llvm.ceil.f32(float [[A5]]) ; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <2 x i32> ; AVX-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[TMP5]]) ; AVX-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 ; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R23:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R23]], float [[AB3]], i32 3 +; AVX-NEXT: [[R21:%.*]] = shufflevector <8 x float> [[R0]], <8 x float> [[TMP7]], <8 x i32> ; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R52:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[R42:%.*]] = shufflevector <8 x float> [[R21]], <8 x float> [[TMP8]], <8 x i32> +; AVX-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R42]], float [[AB5]], i32 5 ; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[R71:%.*]] = shufflevector <8 x float> [[R52]], <8 x float> [[TMP9]], <8 x i32> -; AVX-NEXT: ret <8 x float> [[R71]] +; AVX-NEXT: [[R73:%.*]] = shufflevector <8 x float> [[R5]], <8 x float> [[TMP9]], <8 x i32> +; AVX-NEXT: ret <8 x float> [[R73]] ; %a0 = extractelement <8 x float> %a, i32 0 %a1 = extractelement <8 x float> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -436,38 +436,36 @@ ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @sdiv_v8i32_undefs( -; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], -; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> -; AVX2-NEXT: ret <8 x i32> [[R71]] +; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX2-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX2-NEXT: [[R21:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R21]], i32 [[AB3]], i32 3 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; AVX2-NEXT: [[R62:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP5]], <8 x i32> +; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R62]], i32 [[AB7]], i32 7 +; AVX2-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @sdiv_v8i32_undefs( -; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], -; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> poison, i32 [[AB1]], i32 1 -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 -; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R71]] +; AVX512-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX512-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX512-NEXT: [[R21:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R21]], i32 [[AB3]], i32 3 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: [[R62:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP5]], <8 x i32> +; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R62]], i32 [[AB7]], i32 7 +; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -436,38 +436,36 @@ ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @sdiv_v8i32_undefs( -; AVX2-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; AVX2-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX2-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], -; AVX2-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX2-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> -; AVX2-NEXT: ret <8 x i32> [[R71]] +; AVX2-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX2-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX2-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX2-NEXT: [[R21:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R21]], i32 [[AB3]], i32 3 +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; AVX2-NEXT: [[R62:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP5]], <8 x i32> +; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R62]], i32 [[AB7]], i32 7 +; AVX2-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @sdiv_v8i32_undefs( -; AVX512-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; AVX512-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; AVX512-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], -; AVX512-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> -; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> , i32 [[AB1]], i32 1 -; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R32:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP5]], <8 x i32> -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R32]], i32 [[AB5]], i32 5 -; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; AVX512-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP6]], <8 x i32> -; AVX512-NEXT: ret <8 x i32> [[R71]] +; AVX512-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 3 +; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[TMP1]], +; AVX512-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <2 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = sdiv <2 x i32> [[TMP3]], +; AVX512-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX512-NEXT: [[R21:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R21]], i32 [[AB3]], i32 3 +; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; AVX512-NEXT: [[R62:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP5]], <8 x i32> +; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R62]], i32 [[AB7]], i32 7 +; AVX512-NEXT: ret <8 x i32> [[R7]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -32,22 +32,24 @@ ; SSE-NEXT: store i8 [[TMP7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 6), align 1 ; SSE-NEXT: [[TMP8:%.*]] = xor i8 [[C]], [[B]] ; SSE-NEXT: store i8 [[TMP8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 7), align 1 -; SSE-NEXT: [[TMP9:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8), align 1 -; SSE-NEXT: [[TMP10:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 9), align 1 -; SSE-NEXT: [[TMP11:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 10), align 1 -; SSE-NEXT: [[TMP12:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 11), align 1 -; SSE-NEXT: [[TMP13:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 12), align 1 -; SSE-NEXT: [[TMP14:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 13), align 1 -; SSE-NEXT: [[TMP15:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 14), align 1 -; SSE-NEXT: [[TMP16:%.*]] = xor i8 [[A]], [[C]] -; SSE-NEXT: store i8 [[TMP16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 15), align 1 +; SSE-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i32 0 +; SSE-NEXT: [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[A]], i32 1 +; SSE-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP10]], i8 [[A]], i32 2 +; SSE-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[A]], i32 3 +; SSE-NEXT: [[TMP13:%.*]] = insertelement <8 x i8> [[TMP12]], i8 [[A]], i32 4 +; SSE-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[A]], i32 5 +; SSE-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[A]], i32 6 +; SSE-NEXT: [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[A]], i32 7 +; SSE-NEXT: [[TMP17:%.*]] = insertelement <8 x i8> poison, i8 [[C]], i32 0 +; SSE-NEXT: [[TMP18:%.*]] = insertelement <8 x i8> [[TMP17]], i8 [[C]], i32 1 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <8 x i8> [[TMP18]], i8 [[C]], i32 2 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP19]], i8 [[C]], i32 3 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <8 x i8> [[TMP20]], i8 [[C]], i32 4 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[C]], i32 5 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <8 x i8> [[TMP22]], i8 [[C]], i32 6 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <8 x i8> [[TMP23]], i8 [[C]], i32 7 +; SSE-NEXT: [[TMP25:%.*]] = xor <8 x i8> [[TMP16]], [[TMP24]] +; SSE-NEXT: store <8 x i8> [[TMP25]], <8 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 8) to <8 x i8>*), align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @splat( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -11,27 +11,23 @@ ; CHECK-LABEL: @LzmaDec_DecodeReal2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P:%.*]], i64 0, i32 4 -; CHECK-NEXT: [[CODE21_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P]], i64 0, i32 5 ; CHECK-NEXT: br label [[DO_BODY66_I:%.*]] ; CHECK: do.body66.i: -; CHECK-NEXT: [[RANGE_2_I:%.*]] = phi i32 [ [[RANGE_4_I:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CODE_2_I:%.*]] = phi i32 [ [[CODE_4_I:%.*]], [[DO_COND_I]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[DOTRANGE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_2_I]] -; CHECK-NEXT: [[DOTCODE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_2_I]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP5:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> , i32 [[TMP2]], i32 1 ; CHECK-NEXT: br i1 undef, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] ; CHECK: if.else.i: -; CHECK-NEXT: [[SUB91_I:%.*]] = sub i32 [[DOTRANGE_2_I]], undef -; CHECK-NEXT: [[SUB92_I:%.*]] = sub i32 [[DOTCODE_2_I]], undef +; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], undef ; CHECK-NEXT: br label [[DO_COND_I]] ; CHECK: do.cond.i: -; CHECK-NEXT: [[RANGE_4_I]] = phi i32 [ [[SUB91_I]], [[IF_ELSE_I]] ], [ undef, [[DO_BODY66_I]] ] -; CHECK-NEXT: [[CODE_4_I]] = phi i32 [ [[SUB92_I]], [[IF_ELSE_I]] ], [ [[DOTCODE_2_I]], [[DO_BODY66_I]] ] +; CHECK-NEXT: [[TMP5]] = phi <2 x i32> [ [[TMP4]], [[IF_ELSE_I]] ], [ [[TMP3]], [[DO_BODY66_I]] ] ; CHECK-NEXT: br i1 undef, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] ; CHECK: do.end1006.i: -; CHECK-NEXT: [[DOTRANGE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_4_I]] -; CHECK-NEXT: [[DOTCODE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_4_I]] -; CHECK-NEXT: store i32 [[DOTRANGE_4_I]], i32* [[RANGE20_I]], align 4 -; CHECK-NEXT: store i32 [[DOTCODE_4_I]], i32* [[CODE21_I]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[RANGE20_I]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -14,23 +14,18 @@ ; CHECK-NEXT: ret void ; CHECK: if.else: ; CHECK-NEXT: [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0 -; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] ; CHECK: land.lhs.true.i.1: ; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] ; CHECK: if.then7.1: -; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 0, 1 -; CHECK-NEXT: store i32 [[INC_1]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 -; CHECK-NEXT: [[DEC_1:%.*]] = add nsw i32 6, -1 -; CHECK-NEXT: store i32 [[DEC_1]], i32* [[NUB5]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> , <2 x i32>* [[TMP0]], align 4 ; CHECK-NEXT: br label [[FOR_INC_1]] ; CHECK: for.inc.1: -; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[DEC_1]], [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[INC_1]], [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ] -; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1 -; CHECK-NEXT: store i32 [[INC_2]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 -; CHECK-NEXT: [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1 -; CHECK-NEXT: store i32 [[DEC_2]], i32* [[NUB5]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ , [[IF_THEN7_1]] ], [ , [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[M_NUMCONSTRAINTROWS4]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: unreachable ; entry: @@ -74,15 +69,14 @@ ; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332:%.*]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS:%.*]], i64 0, i32 2, i64 0, i32 0, i64 1 ; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS]], i64 0, i32 2, i64 0, i32 0, i64 2 ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX36]], align 4 -; CHECK-NEXT: [[ADD587:%.*]] = fadd float undef, undef -; CHECK-NEXT: [[SUB600:%.*]] = fsub float [[ADD587]], undef -; CHECK-NEXT: store float [[SUB600]], float* undef, align 4 -; CHECK-NEXT: [[SUB613:%.*]] = fsub float [[ADD587]], [[SUB600]] -; CHECK-NEXT: store float [[SUB613]], float* [[ARRAYIDX26]], align 4 -; CHECK-NEXT: [[ADD626:%.*]] = fadd float [[TMP0]], undef -; CHECK-NEXT: [[SUB639:%.*]] = fsub float [[ADD626]], undef -; CHECK-NEXT: [[SUB652:%.*]] = fsub float [[ADD626]], [[SUB639]] -; CHECK-NEXT: store float [[SUB652]], float* [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> undef, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP2]], undef +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: store float [[TMP4]], float* undef, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX26]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP5]], <2 x float>* [[TMP6]], align 4 ; CHECK-NEXT: br i1 undef, label [[IF_ELSE1609:%.*]], label [[IF_THEN1595:%.*]] ; CHECK: if.then1595: ; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[FOR_BODY_LR_PH_I_I1702:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll @@ -24,34 +24,29 @@ ; CHECK: for.body233: ; CHECK-NEXT: br i1 undef, label [[FOR_BODY233]], label [[FOR_END271]] ; CHECK: for.end271: -; CHECK-NEXT: [[TMP0:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] -; CHECK-NEXT: [[SUB275:%.*]] = fsub float undef, [[TMP1]] -; CHECK-NEXT: [[SUB279:%.*]] = fsub float undef, [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ , [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] +; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> undef, [[TMP0]] ; CHECK-NEXT: br i1 undef, label [[IF_THEN291:%.*]], label [[RETURN]] ; CHECK: if.then291: -; CHECK-NEXT: [[MUL292:%.*]] = fmul float [[SUB275]], 5.000000e-01 -; CHECK-NEXT: [[ADD294:%.*]] = fadd float [[TMP1]], [[MUL292]] -; CHECK-NEXT: [[MUL295:%.*]] = fmul float [[SUB279]], 5.000000e-01 -; CHECK-NEXT: [[ADD297:%.*]] = fadd float [[TMP0]], [[MUL295]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP0]], [[TMP2]] ; CHECK-NEXT: br i1 undef, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]] ; CHECK: if.else319: ; CHECK-NEXT: br i1 undef, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]] ; CHECK: if.then325: ; CHECK-NEXT: br label [[IF_END327]] ; CHECK: if.end327: +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> , float [[TMP4]], i32 0 ; CHECK-NEXT: br i1 undef, label [[IF_THEN329:%.*]], label [[IF_END332]] ; CHECK: if.then329: ; CHECK-NEXT: br label [[IF_END332]] ; CHECK: if.end332: -; CHECK-NEXT: [[DX272_1:%.*]] = phi float [ [[SUB275]], [[IF_THEN329]] ], [ [[SUB275]], [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ] -; CHECK-NEXT: [[DY276_1:%.*]] = phi float [ undef, [[IF_THEN329]] ], [ undef, [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ] -; CHECK-NEXT: [[SUB334:%.*]] = fsub float [[ADD294]], [[DX272_1]] -; CHECK-NEXT: [[SUB338:%.*]] = fsub float [[ADD297]], [[DY276_1]] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[IF_THEN329]] ], [ [[TMP5]], [[IF_END327]] ], [ , [[IF_THEN291]] ] +; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP3]], [[TMP6]] ; CHECK-NEXT: [[ARRAYIDX_I_I606:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113:%.*]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES:%.*]], i64 0, i32 0, i64 0 -; CHECK-NEXT: store float [[SUB334]], float* [[ARRAYIDX_I_I606]], align 4 -; CHECK-NEXT: [[ARRAYIDX3_I607:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES]], i64 0, i32 0, i64 1 -; CHECK-NEXT: store float [[SUB338]], float* [[ARRAYIDX3_I607]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX_I_I606]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP7]], <2 x float>* [[TMP8]], align 4 ; CHECK-NEXT: br label [[RETURN]] ; CHECK: return: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -27,25 +27,24 @@ ; CHECK: land.rhs.lr.ph: ; CHECK-NEXT: unreachable ; CHECK: if.end98: -; CHECK-NEXT: [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 ; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] ; CHECK: if.then103: ; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef ; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2 -; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 +; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 ; CHECK-NEXT: [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[COND125]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[DOTSUB100]], i32 1 ; CHECK-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK: for.cond.i: -; CHECK-NEXT: [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ] -; CHECK-NEXT: [[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ undef, [[LAND_RHS_I874:%.*]] ], [ [[TMP1]], [[IF_THEN103]] ] ; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] ; CHECK: land.rhs.i874: ; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]] ; CHECK: for.end.i: ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] ; CHECK: if.then.i: -; CHECK-NEXT: [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef -; CHECK-NEXT: [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], undef ; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]] ; CHECK: if.end.i: ; CHECK-NEXT: [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]] @@ -66,14 +65,12 @@ ; CHECK: while.end275.i: ; CHECK-NEXT: br label [[EXTEND_BW_EXIT]] ; CHECK: extend_bw.exit: -; CHECK-NEXT: [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] -; CHECK-NEXT: [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ [[TMP3]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] ; CHECK-NEXT: br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]] ; CHECK: if.then157: -; CHECK-NEXT: [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1 -; CHECK-NEXT: store i32 [[ADD158]], i32* [[FROM299]], align 4 -; CHECK-NEXT: [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1 -; CHECK-NEXT: store i32 [[ADD160]], i32* [[FROM1115]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[FROM1115]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 ; CHECK-NEXT: br label [[LAND_LHS_TRUE167]] ; CHECK: land.lhs.true167: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi-inseltpoison.ll @@ -195,30 +195,9 @@ define void @fptosi_8f64_8i8() #0 { ; CHECK-LABEL: @fptosi_8f64_8i8( -; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i8 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i8 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i8 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i8 -; CHECK-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i8 -; CHECK-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i8 -; CHECK-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i8 -; CHECK-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i8 -; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 ; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 @@ -428,30 +407,9 @@ define void @fptosi_8f32_8i8() #0 { ; CHECK-LABEL: @fptosi_8f32_8i8( -; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i8 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i8 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i8 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i8 -; CHECK-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i8 -; CHECK-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i8 -; CHECK-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i8 -; CHECK-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i8 -; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 ; CHECK-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptosi.ll @@ -195,30 +195,9 @@ define void @fptosi_8f64_8i8() #0 { ; CHECK-LABEL: @fptosi_8f64_8i8( -; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = fptosi double [[A0]] to i8 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi double [[A1]] to i8 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi double [[A2]] to i8 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi double [[A3]] to i8 -; CHECK-NEXT: [[CVT4:%.*]] = fptosi double [[A4]] to i8 -; CHECK-NEXT: [[CVT5:%.*]] = fptosi double [[A5]] to i8 -; CHECK-NEXT: [[CVT6:%.*]] = fptosi double [[A6]] to i8 -; CHECK-NEXT: [[CVT7:%.*]] = fptosi double [[A7]] to i8 -; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x double> [[TMP1]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 ; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 @@ -428,30 +407,9 @@ define void @fptosi_8f32_8i8() #0 { ; CHECK-LABEL: @fptosi_8f32_8i8( -; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[CVT0:%.*]] = fptosi float [[A0]] to i8 -; CHECK-NEXT: [[CVT1:%.*]] = fptosi float [[A1]] to i8 -; CHECK-NEXT: [[CVT2:%.*]] = fptosi float [[A2]] to i8 -; CHECK-NEXT: [[CVT3:%.*]] = fptosi float [[A3]] to i8 -; CHECK-NEXT: [[CVT4:%.*]] = fptosi float [[A4]] to i8 -; CHECK-NEXT: [[CVT5:%.*]] = fptosi float [[A5]] to i8 -; CHECK-NEXT: [[CVT6:%.*]] = fptosi float [[A6]] to i8 -; CHECK-NEXT: [[CVT7:%.*]] = fptosi float [[A7]] to i8 -; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <8 x float> [[TMP1]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 ; CHECK-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/fptoui.ll @@ -195,30 +195,9 @@ define void @fptoui_8f64_8i8() #0 { ; CHECK-LABEL: @fptoui_8f64_8i8( -; CHECK-NEXT: [[A0:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 -; CHECK-NEXT: [[A1:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 7), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = fptoui double [[A0]] to i8 -; CHECK-NEXT: [[CVT1:%.*]] = fptoui double [[A1]] to i8 -; CHECK-NEXT: [[CVT2:%.*]] = fptoui double [[A2]] to i8 -; CHECK-NEXT: [[CVT3:%.*]] = fptoui double [[A3]] to i8 -; CHECK-NEXT: [[CVT4:%.*]] = fptoui double [[A4]] to i8 -; CHECK-NEXT: [[CVT5:%.*]] = fptoui double [[A5]] to i8 -; CHECK-NEXT: [[CVT6:%.*]] = fptoui double [[A6]] to i8 -; CHECK-NEXT: [[CVT7:%.*]] = fptoui double [[A7]] to i8 -; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, <8 x double>* bitcast ([8 x double]* @src64 to <8 x double>*), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x double> [[TMP1]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 ; CHECK-NEXT: ret void ; %a0 = load double, double* getelementptr inbounds ([8 x double], [8 x double]* @src64, i32 0, i64 0), align 8 @@ -428,30 +407,9 @@ define void @fptoui_8f32_8i8() #0 { ; CHECK-LABEL: @fptoui_8f32_8i8( -; CHECK-NEXT: [[A0:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 -; CHECK-NEXT: [[A1:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 1), align 4 -; CHECK-NEXT: [[A2:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 2), align 4 -; CHECK-NEXT: [[A3:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 3), align 4 -; CHECK-NEXT: [[A4:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 4), align 4 -; CHECK-NEXT: [[A5:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 5), align 4 -; CHECK-NEXT: [[A6:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 6), align 4 -; CHECK-NEXT: [[A7:%.*]] = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 7), align 4 -; CHECK-NEXT: [[CVT0:%.*]] = fptoui float [[A0]] to i8 -; CHECK-NEXT: [[CVT1:%.*]] = fptoui float [[A1]] to i8 -; CHECK-NEXT: [[CVT2:%.*]] = fptoui float [[A2]] to i8 -; CHECK-NEXT: [[CVT3:%.*]] = fptoui float [[A3]] to i8 -; CHECK-NEXT: [[CVT4:%.*]] = fptoui float [[A4]] to i8 -; CHECK-NEXT: [[CVT5:%.*]] = fptoui float [[A5]] to i8 -; CHECK-NEXT: [[CVT6:%.*]] = fptoui float [[A6]] to i8 -; CHECK-NEXT: [[CVT7:%.*]] = fptoui float [[A7]] to i8 -; CHECK-NEXT: store i8 [[CVT0]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 0), align 1 -; CHECK-NEXT: store i8 [[CVT1]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 1), align 1 -; CHECK-NEXT: store i8 [[CVT2]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 2), align 1 -; CHECK-NEXT: store i8 [[CVT3]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 3), align 1 -; CHECK-NEXT: store i8 [[CVT4]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 4), align 1 -; CHECK-NEXT: store i8 [[CVT5]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 5), align 1 -; CHECK-NEXT: store i8 [[CVT6]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 6), align 1 -; CHECK-NEXT: store i8 [[CVT7]], i8* getelementptr inbounds ([64 x i8], [64 x i8]* @dst8, i32 0, i64 7), align 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([16 x float]* @src32 to <8 x float>*), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fptoui <8 x float> [[TMP1]] to <8 x i8> +; CHECK-NEXT: store <8 x i8> [[TMP2]], <8 x i8>* bitcast ([64 x i8]* @dst8 to <8 x i8>*), align 1 ; CHECK-NEXT: ret void ; %a0 = load float, float* getelementptr inbounds ([16 x float], [16 x float]* @src32, i32 0, i64 0), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertvalue.ll @@ -216,24 +216,21 @@ define void @julia_load_array_of_i16([4 x i16]* %a, [4 x i16]* %b, [4 x i16]* %c) { ; CHECK-LABEL: @julia_load_array_of_i16( ; CHECK-NEXT: top: -; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A:%.*]], align 4 -; CHECK-NEXT: [[A0:%.*]] = extractvalue [4 x i16] [[A_ARR]], 0 -; CHECK-NEXT: [[A2:%.*]] = extractvalue [4 x i16] [[A_ARR]], 2 -; CHECK-NEXT: [[A1:%.*]] = extractvalue [4 x i16] [[A_ARR]], 1 -; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B:%.*]], align 4 -; CHECK-NEXT: [[B0:%.*]] = extractvalue [4 x i16] [[B_ARR]], 0 -; CHECK-NEXT: [[B2:%.*]] = extractvalue [4 x i16] [[B_ARR]], 2 -; CHECK-NEXT: [[B1:%.*]] = extractvalue [4 x i16] [[B_ARR]], 1 -; CHECK-NEXT: [[A3:%.*]] = extractvalue [4 x i16] [[A_ARR]], 3 -; CHECK-NEXT: [[C1:%.*]] = sub i16 [[A1]], [[B1]] -; CHECK-NEXT: [[B3:%.*]] = extractvalue [4 x i16] [[B_ARR]], 3 -; CHECK-NEXT: [[C0:%.*]] = sub i16 [[A0]], [[B0]] -; CHECK-NEXT: [[C2:%.*]] = sub i16 [[A2]], [[B2]] -; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[C0]], 0 -; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[C1]], 1 -; CHECK-NEXT: [[C3:%.*]] = sub i16 [[A3]], [[B3]] -; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[C2]], 2 -; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[C3]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast [4 x i16]* [[A:%.*]] to <4 x i16>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 4 +; CHECK-NEXT: [[A_ARR:%.*]] = load [4 x i16], [4 x i16]* [[A]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [4 x i16]* [[B:%.*]] to <4 x i16>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP2]], align 4 +; CHECK-NEXT: [[B_ARR:%.*]] = load [4 x i16], [4 x i16]* [[B]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i16> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP4]], i32 0 +; CHECK-NEXT: [[C_ARR0:%.*]] = insertvalue [4 x i16] undef, i16 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP4]], i32 1 +; CHECK-NEXT: [[C_ARR1:%.*]] = insertvalue [4 x i16] [[C_ARR0]], i16 [[TMP6]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[C_ARR2:%.*]] = insertvalue [4 x i16] [[C_ARR1]], i16 [[TMP7]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[C_ARR3:%.*]] = insertvalue [4 x i16] [[C_ARR2]], i16 [[TMP8]], 3 ; CHECK-NEXT: store [4 x i16] [[C_ARR3]], [4 x i16]* [[C:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -275,12 +272,12 @@ ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[C_STRUCT0:%.*]] = insertvalue [[PSEUDOVEC]] undef, float [[TMP5]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP4]], i32 1 -; CHECK-NEXT: [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct0, float [[TMP6]], 1 +; CHECK-NEXT: [[C_STRUCT1:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT0]], float [[TMP6]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 -; CHECK-NEXT: [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct1, float [[TMP7]], 2 +; CHECK-NEXT: [[C_STRUCT2:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT1]], float [[TMP7]], 2 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP4]], i32 3 -; CHECK-NEXT: [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] %c_struct2, float [[TMP8]], 3 -; CHECK-NEXT: store [[PSEUDOVEC]] %c_struct3, %pseudovec* [[C:%.*]], align 4 +; CHECK-NEXT: [[C_STRUCT3:%.*]] = insertvalue [[PSEUDOVEC]] [[C_STRUCT2]], float [[TMP8]], 3 +; CHECK-NEXT: store [[PSEUDOVEC]] [[C_STRUCT3]], %pseudovec* [[C:%.*]], align 4 ; CHECK-NEXT: ret void ; top: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -77,9 +77,9 @@ ; CHECK-NEXT: [[A_5:%.*]] = getelementptr inbounds float, float* [[A]], i64 5 ; CHECK-NEXT: store float [[L6]], float* [[A_5]], align 4 ; CHECK-NEXT: [[A_6:%.*]] = getelementptr inbounds float, float* [[A]], i64 6 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_6]], align 4 ; CHECK-NEXT: [[A_7:%.*]] = getelementptr inbounds float, float* [[A]], i64 7 -; CHECK-NEXT: store float 0.000000e+00, float* [[A_7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A_6]] to <2 x float>* +; CHECK-NEXT: store <2 x float> zeroinitializer, <2 x float>* [[TMP0]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll @@ -12,22 +12,22 @@ ; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1 ; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2 ; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3 -; CHECK-NEXT: [[V0:%.*]] = load i32, i32* [[GEP1_0]] -; CHECK-NEXT: [[V1:%.*]] = load i32, i32* [[GEP1_1]] -; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]] -; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]] -; CHECK-NEXT: [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146 -; CHECK-NEXT: [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[GEP1_0]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4 +; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], ; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42 ; CHECK-NEXT: [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0 -; CHECK-NEXT: [[RES0:%.*]] = add nsw i32 [[V0]], [[Y0]] -; CHECK-NEXT: [[RES1:%.*]] = add nsw i32 [[V1]], [[Y1]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]] ; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]] -; CHECK-NEXT: store i32 [[RES0]], i32* [[GEP2_0]] -; CHECK-NEXT: store i32 [[RES1]], i32* [[GEP2_1]] -; CHECK-NEXT: store i32 [[RES2]], i32* [[GEP2_2]] -; CHECK-NEXT: store i32 [[RES3]], i32* [[GEP2_3]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[GEP2_0]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: store i32 [[RES2]], i32* [[GEP2_2]], align 4 +; CHECK-NEXT: store i32 [[RES3]], i32* [[GEP2_3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -76,10 +76,10 @@ ; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr i32, i32* [[ARR2]], i32 1 ; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr i32, i32* [[ARR2]], i32 2 ; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, i32* [[ARR2]], i32 3 -; CHECK-NEXT: [[V0:%.*]] = load i32, i32* [[GEP1_0]] -; CHECK-NEXT: [[V1:%.*]] = load i32, i32* [[GEP1_1]] -; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]] -; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]] +; CHECK-NEXT: [[V0:%.*]] = load i32, i32* [[GEP1_0]], align 4 +; CHECK-NEXT: [[V1:%.*]] = load i32, i32* [[GEP1_1]], align 4 +; CHECK-NEXT: [[V2:%.*]] = load i32, i32* [[GEP1_2]], align 4 +; CHECK-NEXT: [[V3:%.*]] = load i32, i32* [[GEP1_3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add nsw i32 [[A0:%.*]], 1146 ; CHECK-NEXT: [[Y1:%.*]] = add nsw i32 [[A1:%.*]], 146 ; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42 @@ -88,10 +88,10 @@ ; CHECK-NEXT: [[RES1:%.*]] = urem i32 [[V1]], [[Y1]] ; CHECK-NEXT: [[RES2:%.*]] = urem i32 [[V2]], [[Y2]] ; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]] -; CHECK-NEXT: store i32 [[RES0]], i32* [[GEP2_0]] -; CHECK-NEXT: store i32 [[RES1]], i32* [[GEP2_1]] -; CHECK-NEXT: store i32 [[RES2]], i32* [[GEP2_2]] -; CHECK-NEXT: store i32 [[RES3]], i32* [[GEP2_3]] +; CHECK-NEXT: store i32 [[RES0]], i32* [[GEP2_0]], align 4 +; CHECK-NEXT: store i32 [[RES1]], i32* [[GEP2_1]], align 4 +; CHECK-NEXT: store i32 [[RES2]], i32* [[GEP2_2]], align 4 +; CHECK-NEXT: store i32 [[RES3]], i32* [[GEP2_3]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022-inseltpoison.ll @@ -147,19 +147,20 @@ define {%StructTy, float, float} @NonHomogeneousStruct(float *%Ptr) { ; CHECK-LABEL: @NonHomogeneousStruct( ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[L0:%.*]] = load float, float* [[GEP0]], align 4 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1 -; CHECK-NEXT: [[L1:%.*]] = load float, float* [[GEP1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2 ; CHECK-NEXT: [[L2:%.*]] = load float, float* [[GEP2]], align 4 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3 ; CHECK-NEXT: [[L3:%.*]] = load float, float* [[GEP3]], align 4 -; CHECK-NEXT: [[FADD0:%.*]] = fadd fast float [[L0]], 1.100000e+01 -; CHECK-NEXT: [[FADD1:%.*]] = fadd fast float [[L1]], 1.200000e+01 +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], ; CHECK-NEXT: [[FADD2:%.*]] = fadd fast float [[L2]], 1.300000e+01 ; CHECK-NEXT: [[FADD3:%.*]] = fadd fast float [[L3]], 1.400000e+01 -; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[FADD0]], 0 -; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[FADD1]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1 ; CHECK-NEXT: [[RET0:%.*]] = insertvalue { [[STRUCTTY]], float, float } undef, [[STRUCTTY]] [[STRUCTIN1]], 0 ; CHECK-NEXT: [[RET1:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET0]], float [[FADD2]], 1 ; CHECK-NEXT: [[RET2:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET1]], float [[FADD3]], 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll @@ -64,13 +64,13 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 ; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] %StructIn0, float [[TMP5]], 1 +; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 ; CHECK-NEXT: [[STRUCTIN2:%.*]] = insertvalue [[STRUCTTY]] undef, float [[TMP6]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 -; CHECK-NEXT: [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] %StructIn2, float [[TMP7]], 1 -; CHECK-NEXT: [[RET0:%.*]] = insertvalue [2 x %StructTy] undef, [[STRUCTTY]] %StructIn1, 0 -; CHECK-NEXT: [[RET1:%.*]] = insertvalue [2 x %StructTy] [[RET0]], [[STRUCTTY]] %StructIn3, 1 +; CHECK-NEXT: [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN2]], float [[TMP7]], 1 +; CHECK-NEXT: [[RET0:%.*]] = insertvalue [2 x %StructTy] undef, [[STRUCTTY]] [[STRUCTIN1]], 0 +; CHECK-NEXT: [[RET1:%.*]] = insertvalue [2 x %StructTy] [[RET0]], [[STRUCTTY]] [[STRUCTIN3]], 1 ; CHECK-NEXT: ret [2 x %StructTy] [[RET1]] ; %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0 @@ -110,13 +110,13 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 ; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 -; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] %StructIn0, float [[TMP5]], 1 +; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 ; CHECK-NEXT: [[STRUCTIN2:%.*]] = insertvalue [[STRUCTTY]] undef, float [[TMP6]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP3]], i32 3 -; CHECK-NEXT: [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] %StructIn2, float [[TMP7]], 1 -; CHECK-NEXT: [[RET0:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } undef, [[STRUCTTY]] %StructIn1, 0 -; CHECK-NEXT: [[RET1:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } [[RET0]], [[STRUCTTY]] %StructIn3, 1 +; CHECK-NEXT: [[STRUCTIN3:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN2]], float [[TMP7]], 1 +; CHECK-NEXT: [[RET0:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } undef, [[STRUCTTY]] [[STRUCTIN1]], 0 +; CHECK-NEXT: [[RET1:%.*]] = insertvalue { [[STRUCTTY]], [[STRUCTTY]] } [[RET0]], [[STRUCTTY]] [[STRUCTIN3]], 1 ; CHECK-NEXT: ret { [[STRUCTTY]], [[STRUCTTY]] } [[RET1]] ; %GEP0 = getelementptr inbounds float, float* %Ptr, i64 0 @@ -147,20 +147,21 @@ define {%StructTy, float, float} @NonHomogeneousStruct(float *%Ptr) { ; CHECK-LABEL: @NonHomogeneousStruct( ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[L0:%.*]] = load float, float* [[GEP0]], align 4 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 1 -; CHECK-NEXT: [[L1:%.*]] = load float, float* [[GEP1]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 2 ; CHECK-NEXT: [[L2:%.*]] = load float, float* [[GEP2]], align 4 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, float* [[PTR]], i64 3 ; CHECK-NEXT: [[L3:%.*]] = load float, float* [[GEP3]], align 4 -; CHECK-NEXT: [[FADD0:%.*]] = fadd fast float [[L0]], 1.100000e+01 -; CHECK-NEXT: [[FADD1:%.*]] = fadd fast float [[L1]], 1.200000e+01 +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], ; CHECK-NEXT: [[FADD2:%.*]] = fadd fast float [[L2]], 1.300000e+01 ; CHECK-NEXT: [[FADD3:%.*]] = fadd fast float [[L3]], 1.400000e+01 -; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[FADD0]], 0 -; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] %StructIn0, float [[FADD1]], 1 -; CHECK-NEXT: [[RET0:%.*]] = insertvalue { [[STRUCTTY]], float, float } undef, [[STRUCTTY]] %StructIn1, 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCTTY:%.*]] undef, float [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCTTY]] [[STRUCTIN0]], float [[TMP5]], 1 +; CHECK-NEXT: [[RET0:%.*]] = insertvalue { [[STRUCTTY]], float, float } undef, [[STRUCTTY]] [[STRUCTIN1]], 0 ; CHECK-NEXT: [[RET1:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET0]], float [[FADD2]], 1 ; CHECK-NEXT: [[RET2:%.*]] = insertvalue { [[STRUCTTY]], float, float } [[RET1]], float [[FADD3]], 2 ; CHECK-NEXT: ret { [[STRUCTTY]], float, float } [[RET2]] @@ -207,25 +208,25 @@ ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 ; CHECK-NEXT: [[STRUCTIN0:%.*]] = insertvalue [[STRUCT1TY:%.*]] undef, i16 [[TMP4]], 0 ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 -; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCT1TY]] %StructIn0, i16 [[TMP5]], 1 +; CHECK-NEXT: [[STRUCTIN1:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN0]], i16 [[TMP5]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 ; CHECK-NEXT: [[STRUCTIN2:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP6]], 0 ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 -; CHECK-NEXT: [[STRUCTIN3:%.*]] = insertvalue [[STRUCT1TY]] %StructIn2, i16 [[TMP7]], 1 +; CHECK-NEXT: [[STRUCTIN3:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN2]], i16 [[TMP7]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 ; CHECK-NEXT: [[STRUCTIN4:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP8]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 -; CHECK-NEXT: [[STRUCTIN5:%.*]] = insertvalue [[STRUCT1TY]] %StructIn4, i16 [[TMP9]], 1 +; CHECK-NEXT: [[STRUCTIN5:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN4]], i16 [[TMP9]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 ; CHECK-NEXT: [[STRUCTIN6:%.*]] = insertvalue [[STRUCT1TY]] undef, i16 [[TMP10]], 0 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 -; CHECK-NEXT: [[STRUCTIN7:%.*]] = insertvalue [[STRUCT1TY]] %StructIn6, i16 [[TMP11]], 1 -; CHECK-NEXT: [[STRUCT2IN0:%.*]] = insertvalue [[STRUCT2TY:%.*]] undef, [[STRUCT1TY]] %StructIn1, 0 -; CHECK-NEXT: [[STRUCT2IN1:%.*]] = insertvalue [[STRUCT2TY]] %Struct2In0, [[STRUCT1TY]] %StructIn3, 1 -; CHECK-NEXT: [[STRUCT2IN2:%.*]] = insertvalue [[STRUCT2TY]] undef, [[STRUCT1TY]] %StructIn5, 0 -; CHECK-NEXT: [[STRUCT2IN3:%.*]] = insertvalue [[STRUCT2TY]] %Struct2In2, [[STRUCT1TY]] %StructIn7, 1 -; CHECK-NEXT: [[RET0:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } undef, [[STRUCT2TY]] %Struct2In1, 0 -; CHECK-NEXT: [[RET1:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET0]], [[STRUCT2TY]] %Struct2In3, 1 +; CHECK-NEXT: [[STRUCTIN7:%.*]] = insertvalue [[STRUCT1TY]] [[STRUCTIN6]], i16 [[TMP11]], 1 +; CHECK-NEXT: [[STRUCT2IN0:%.*]] = insertvalue [[STRUCT2TY:%.*]] undef, [[STRUCT1TY]] [[STRUCTIN1]], 0 +; CHECK-NEXT: [[STRUCT2IN1:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN0]], [[STRUCT1TY]] [[STRUCTIN3]], 1 +; CHECK-NEXT: [[STRUCT2IN2:%.*]] = insertvalue [[STRUCT2TY]] undef, [[STRUCT1TY]] [[STRUCTIN5]], 0 +; CHECK-NEXT: [[STRUCT2IN3:%.*]] = insertvalue [[STRUCT2TY]] [[STRUCT2IN2]], [[STRUCT1TY]] [[STRUCTIN7]], 1 +; CHECK-NEXT: [[RET0:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } undef, [[STRUCT2TY]] [[STRUCT2IN1]], 0 +; CHECK-NEXT: [[RET1:%.*]] = insertvalue { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET0]], [[STRUCT2TY]] [[STRUCT2IN3]], 1 ; CHECK-NEXT: ret { [[STRUCT2TY]], [[STRUCT2TY]] } [[RET1]] ; %GEP0 = getelementptr inbounds i16, i16* %Ptr, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_not_all_parts.ll @@ -13,24 +13,25 @@ ; CHECK-NEXT: [[A_088:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD24:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DIFF:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[TMP1]], 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP2]] ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 0 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX6]], align 16 -; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[ADD3]], [[A_088]] -; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP1]], 1 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX13]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP1]], 5 -; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX16]], align 4 -; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP8]], [[TMP6]] +; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], 1 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP1]], 5 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, i32* [[DIFF]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[ARRAYIDX2]] to <2 x i32>* +; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i32> [[TMP9]], i32 0 +; CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP10]], [[A_088]] ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* [[M2]], i64 0, i64 [[INDVARS_IV]], i64 1 -; CHECK-NEXT: store i32 [[ADD17]], i32* [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[ADD24]] = add nsw i32 [[ADD10]], [[ADD17]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[ARRAYIDX6]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP9]], <2 x i32>* [[TMP11]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[ADD24]] = add nsw i32 [[ADD10]], [[TMP12]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] @@ -68,11 +69,20 @@ %add24 = add nsw i32 %add10, %add17 ; YAML: Pass: slp-vectorizer - ; YAML-NEXT: Name: NotPossible + ; YAML-NEXT: Name: StoresVectorized ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: - ; YAML-NEXT: - String: 'Cannot SLP vectorize list: vectorization was impossible' - ; YAML-NEXT: - String: ' with available vectorization factors' + ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' + ; YAML-NEXT: - Cost: '-1' + ; YAML-NEXT: - String: ' and with tree size ' + ; YAML-NEXT: - TreeSize: '4' + ; + ; YAML: Pass: slp-vectorizer + ; YAML-NEXT: Name: UnsupportedType + ; YAML-NEXT: Function: foo + ; YAML-NEXT: Args: + ; YAML-NEXT: - String: 'Cannot SLP vectorize list: type ' + ; YAML-NEXT: - String: '<2 x i32> is unsupported by vectorizer' %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 %exitcond = icmp eq i64 %indvars.iv.next, 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -9,33 +9,38 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP18:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = load float, float* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = fmul float [[TMP5]], [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = fmul float [[TMP7]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = fsub float [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = fmul float [[TMP7]], [[TMP9]] -; CHECK-NEXT: [[TMP16:%.*]] = fmul float [[TMP5]], [[TMP11]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd float [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18]] = fadd float [[TMP3]], [[TMP14]] -; CHECK-NEXT: [[TMP19]] = fadd float [[TMP2]], [[TMP17]] -; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP25:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP24:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], %struct.complex* [[A:%.*]], i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[A]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP3]] to <2 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B:%.*]], i64 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[B]], i64 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x float> poison, float [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> [[TMP11]], float [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = fmul <2 x float> [[TMP6]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> poison, float [[TMP14]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x float> [[TMP15]], float [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x float> poison, float [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x float> [[TMP17]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fsub <2 x float> [[TMP13]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = fadd <2 x float> [[TMP13]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x float> [[TMP21]], <2 x float> [[TMP22]], <2 x i32> +; CHECK-NEXT: [[TMP24]] = fadd <2 x float> [[TMP2]], [[TMP23]] +; CHECK-NEXT: [[TMP25]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP26]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 -; CHECK-NEXT: store float [[TMP18]], float* [[TMP22]], align 4 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1 -; CHECK-NEXT: store float [[TMP19]], float* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT:%.*]], i32 0, i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], %struct.complex* [[RESULT]], i32 0, i32 1 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP27]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP24]], <2 x float>* [[TMP29]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -23,42 +23,37 @@ define float @foo(float* nocapture readonly %A) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <2 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP9]], -; CHECK-NEXT: [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 +; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], +; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP12]] ; CHECK-NEXT: ret float [[ADD17]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll b/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/saxpy.ll @@ -63,15 +63,11 @@ ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[I:%.*]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[Y:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw i32 undef, [[TMP4]] -; CHECK-NEXT: store i32 [[TMP5]], i32* [[TMP2]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[Y]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 undef, [[TMP9]] -; CHECK-NEXT: store i32 [[TMP10]], i32* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <2 x i32> undef, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; %1 = add i64 %i, 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll @@ -14,14 +14,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], ; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], ; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0) to <2 x i32>*), align 4 +; CHECK-NEXT: [[TMP4:%.*]] = lshr <2 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[TMP4]], +; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* bitcast (i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0) to <2 x i32>*), align 4 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget.ll @@ -16,9 +16,9 @@ ; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 ; CHECK-NEXT: [[L1:%.*]] = load float, float* [[A1]], align 4 ; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[L2:%.*]] = load float, float* [[A2]], align 4 ; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3 -; CHECK-NEXT: [[L3:%.*]] = load float, float* [[A3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A2]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: call void @unknown() ; CHECK-NEXT: call void @unknown() @@ -51,19 +51,19 @@ ; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B]], i64 1 ; CHECK-NEXT: store float [[L1]], float* [[B1]], align 4 ; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 -; CHECK-NEXT: store float [[L2]], float* [[B2]], align 4 ; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 -; CHECK-NEXT: store float [[L3]], float* [[B3]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B2]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP1]], <2 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[C1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i64 1 ; CHECK-NEXT: [[C2:%.*]] = getelementptr inbounds float, float* [[C]], i64 2 ; CHECK-NEXT: [[C3:%.*]] = getelementptr inbounds float, float* [[C]], i64 3 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[C]] to <4 x float>* -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C]] to <4 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[D1:%.*]] = getelementptr inbounds float, float* [[D:%.*]], i64 1 ; CHECK-NEXT: [[D2:%.*]] = getelementptr inbounds float, float* [[D]], i64 2 ; CHECK-NEXT: [[D3:%.*]] = getelementptr inbounds float, float* [[D]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[D]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[D]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/simple-loop.ll @@ -71,38 +71,28 @@ ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N:%.*]], 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[TMP26:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ] +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[TMP18:%.*]], [[DOTLR_PH]] ], [ 0, [[TMP0:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[I_019]], 2 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = or i64 [[TMP2]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP4]], 7 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 7 -; CHECK-NEXT: [[TMP16:%.*]] = mul i32 [[TMP7]], 7 -; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 14 -; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[TMP10]], 7 -; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 21 -; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[TMP13]], 7 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP20]], 28 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]] -; CHECK-NEXT: store i32 [[TMP15]], i32* [[TMP22]], align 4 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP5]] -; CHECK-NEXT: store i32 [[TMP17]], i32* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[IN]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = mul <2 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP12]], +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP11]], <2 x i32>* [[TMP15]], align 4 ; CHECK-NEXT: [[BARRIER:%.*]] = call i32 @goo(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP8]] -; CHECK-NEXT: store i32 [[TMP19]], i32* [[TMP24]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP11]] -; CHECK-NEXT: store i32 [[TMP21]], i32* [[TMP25]], align 4 -; CHECK-NEXT: [[TMP26]] = add i64 [[I_019]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP26]], [[N]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP13]], <2 x i32>* [[TMP17]], align 4 +; CHECK-NEXT: [[TMP18]] = add i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP18]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]] ; CHECK: ._crit_edge: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -535,14 +535,35 @@ ; define void @sitofp_2i64_2f32() #0 { -; CHECK-LABEL: @sitofp_2i64_2f32( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_2i64_2f32( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: ret void +; +; AVX256NODQ-LABEL: @sitofp_2i64_2f32( +; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float +; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float +; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX256NODQ-NEXT: ret void +; +; AVX512-LABEL: @sitofp_2i64_2f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 +; AVX512-NEXT: ret void +; +; AVX256DQ-LABEL: @sitofp_2i64_2f32( +; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 +; AVX256DQ-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -535,14 +535,35 @@ ; define void @sitofp_2i64_2f32() #0 { -; CHECK-LABEL: @sitofp_2i64_2f32( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float -; CHECK-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float -; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @sitofp_2i64_2f32( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; SSE-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: ret void +; +; AVX256NODQ-LABEL: @sitofp_2i64_2f32( +; AVX256NODQ-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX256NODQ-NEXT: [[CVT0:%.*]] = sitofp i64 [[LD0]] to float +; AVX256NODQ-NEXT: [[CVT1:%.*]] = sitofp i64 [[LD1]] to float +; AVX256NODQ-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX256NODQ-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX256NODQ-NEXT: ret void +; +; AVX512-LABEL: @sitofp_2i64_2f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 +; AVX512-NEXT: ret void +; +; AVX256DQ-LABEL: @sitofp_2i64_2f32( +; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 +; AVX256DQ-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -172,13 +172,13 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 1 ; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 2 -; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 3 -; CHECK-NEXT: store float [[TMP3]], float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[ARRAYIDX5]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, float* [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, float* [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/uitofp.ll @@ -472,14 +472,44 @@ ; define void @uitofp_2i64_2f32() #0 { -; CHECK-LABEL: @uitofp_2i64_2f32( -; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 -; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 -; CHECK-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float -; CHECK-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float -; CHECK-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 -; CHECK-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 -; CHECK-NEXT: ret void +; SSE-LABEL: @uitofp_2i64_2f32( +; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; SSE-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; SSE-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; SSE-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; SSE-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @uitofp_2i64_2f32( +; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX1-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX1-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; AVX1-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; AVX1-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX1-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @uitofp_2i64_2f32( +; AVX2-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 +; AVX2-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 +; AVX2-NEXT: [[CVT0:%.*]] = uitofp i64 [[LD0]] to float +; AVX2-NEXT: [[CVT1:%.*]] = uitofp i64 [[LD1]] to float +; AVX2-NEXT: store float [[CVT0]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 0), align 64 +; AVX2-NEXT: store float [[CVT1]], float* getelementptr inbounds ([16 x float], [16 x float]* @dst32, i32 0, i64 1), align 4 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @uitofp_2i64_2f32( +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX512-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX512-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 +; AVX512-NEXT: ret void +; +; AVX256DQ-LABEL: @uitofp_2i64_2f32( +; AVX256DQ-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @src64 to <2 x i64>*), align 64 +; AVX256DQ-NEXT: [[TMP2:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x float> +; AVX256DQ-NEXT: store <2 x float> [[TMP2]], <2 x float>* bitcast ([16 x float]* @dst32 to <2 x float>*), align 64 +; AVX256DQ-NEXT: ret void ; %ld0 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 0), align 64 %ld1 = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @src64, i32 0, i64 1), align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -47,17 +47,16 @@ ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP5]], 3 ; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -95,13 +94,12 @@ ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -214,13 +212,14 @@ ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 ; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[INCDEC_PTR2]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <2 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <2 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[INCDEC_PTR3]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,21 +247,22 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 +; CHECK-NEXT: store i32 [[TMP6]], i32* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP7]], -3 ; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -291,21 +291,20 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <2 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <2 x i32> [[TMP1]], ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP2]], <2 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 +; CHECK-NEXT: store i32 [[TMP4]], i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP5]], -9 ; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -338,17 +337,16 @@ ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 ; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[INCDEC_PTR]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i32> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[INCDEC_PTR1]] to <2 x i32>* +; CHECK-NEXT: store <2 x i32> [[TMP3]], <2 x i32>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP5]], 3 ; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -457,17 +455,16 @@ ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 ; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <2 x float> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP5]], 3.000000e+00 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -505,13 +502,12 @@ ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -624,13 +620,14 @@ ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[INCDEC_PTR3]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float>* [[TMP7]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -658,21 +655,22 @@ ; CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 +; CHECK-NEXT: store float [[TMP6]], float* [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP7]], -3.000000e+00 ; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 ; CHECK-NEXT: ret void ; @@ -701,21 +699,20 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 +; CHECK-NEXT: store float [[TMP4]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00 ; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -786,17 +783,16 @@ ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 ; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], 2.000000e+00 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[INCDEC_PTR]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], 3.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[INCDEC_PTR1]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP5]], 3.000000e+00 ; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; @@ -834,13 +830,12 @@ ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 ; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[INCDEC_PTR2]] to <2 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[INCDEC_PTR4]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP4]], <2 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -944,21 +939,20 @@ ; CHECK-LABEL: @mulfn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul float [[TMP1]], -3.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x float> [[TMP1]], ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <2 x float>* +; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP3]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 +; CHECK-NEXT: store float [[TMP4]], float* [[INCDEC_PTR4]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 +; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP5]], -9.000000e+00 ; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll --- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -174,9 +174,9 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -186,8 +186,9 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float>* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %b = bitcast <4 x float>* %p to float* @@ -200,9 +201,9 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -214,9 +215,9 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %b = bitcast <16 x i8>* %p to i32* @@ -229,8 +230,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float>* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0 @@ -243,8 +245,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> addrspace(44)* [[P:%.*]] to <2 x float> addrspace(44)* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(44)* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0 @@ -258,9 +261,9 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync { ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 @@ -272,16 +275,12 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 2 @@ -292,16 +291,12 @@ ; Verify that alignment of the new load is not over-specified. define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 8 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 8 @@ -316,9 +311,9 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[B:%.*]] = bitcast i8* [[GEP]] to i32* -; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 1 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[GEP]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 1 @@ -330,9 +325,10 @@ define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync { ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 12 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[GEP]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12 @@ -366,9 +362,9 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) nofree nosync { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 @@ -442,8 +438,9 @@ define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) nofree nosync { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 @@ -470,9 +467,9 @@ define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32_align( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -484,8 +481,9 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32_deref( -; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> poison, float [[S]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -495,9 +493,9 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -507,8 +505,9 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32>* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %b = bitcast <4 x i32>* %p to i32* @@ -519,9 +518,9 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v16f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -531,9 +530,9 @@ define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v2f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -586,9 +585,8 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <2 x float>, <2 x float>* %p, align 4 @@ -599,9 +597,9 @@ define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <8 x float>, <8 x float>* %p, align 4 @@ -628,18 +626,12 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[TMP1]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; AVX2-NEXT: [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>* -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16>* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 8 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1 %l = load <2 x i16>, <2 x i16>* %gep, align 8 diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll --- a/llvm/test/Transforms/VectorCombine/X86/load.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE2 -; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -174,9 +174,9 @@ define <4 x float> @load_f32_insert_v4f32(float* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -186,8 +186,9 @@ define <4 x float> @casted_load_f32_insert_v4f32(<4 x float>* align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float>* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %b = bitcast <4 x float>* %p to float* @@ -200,9 +201,9 @@ define <4 x i32> @load_i32_insert_v4i32(i32* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -214,9 +215,9 @@ define <4 x i32> @casted_load_i32_insert_v4i32(<16 x i8>* align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %b = bitcast <16 x i8>* %p to i32* @@ -229,8 +230,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32(<4 x float>* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float>* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0 @@ -243,8 +245,9 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(<4 x float> addrspace(44)* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float> addrspace(44)* [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> addrspace(44)* [[P:%.*]] to <2 x float> addrspace(44)* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(44)* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(44)* %p, i64 0, i64 0 @@ -258,9 +261,9 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(18) %p) nofree nosync { ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 @@ -272,16 +275,12 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep01_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(17) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 2 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 16 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 2 @@ -292,16 +291,12 @@ ; Verify that alignment of the new load is not over-specified. define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(<8 x i16>* align 2 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[P:%.*]], align 2 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 8 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 0, i64 1 %s = load i16, i16* %gep, align 8 @@ -316,9 +311,9 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(16) %p) { ; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[B:%.*]] = bitcast i8* [[GEP]] to i32* -; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 1 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[GEP]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 1 @@ -330,9 +325,10 @@ define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(<16 x i8>* align 1 dereferenceable(20) %p) nofree nosync { ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 1 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P:%.*]], i64 0, i64 12 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[GEP]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; %gep = getelementptr inbounds <16 x i8>, <16 x i8>* %p, i64 0, i64 12 @@ -366,9 +362,9 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(<8 x i16>* align 16 dereferenceable(32) %p) nofree nosync { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <8 x i16>* -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 @@ -442,8 +438,9 @@ define <8 x i16> @gep10_load_i16_insert_v8i16_deref(<8 x i16>* align 16 dereferenceable(31) %p) nofree nosync { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16_deref( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[S:%.*]] = load i16, i16* [[GEP]], align 16 -; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 1, i64 0 @@ -470,9 +467,9 @@ define <4 x float> @load_f32_insert_v4f32_align(float* align 1 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32_align( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -484,8 +481,9 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(15) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32_deref( -; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -495,9 +493,9 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, i32* %p, align 4 @@ -507,8 +505,9 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32>* [[P:%.*]] to <2 x i32>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %b = bitcast <4 x i32>* %p to i32* @@ -519,9 +518,9 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v16f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <16 x i32> ; CHECK-NEXT: ret <16 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -531,9 +530,9 @@ define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v2f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: ret <2 x float> [[R]] ; %s = load float, float* %p, align 4 @@ -586,9 +585,8 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(<2 x float>* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <2 x float>, <2 x float>* %p, align 4 @@ -599,9 +597,9 @@ define <4 x float> @load_v8f32_extract_insert_v4f32(<8 x float>* align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_v8f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x float>* [[P:%.*]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 16 +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <8 x float>, <8 x float>* %p, align 4 @@ -628,18 +626,12 @@ ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(<2 x i16>* align 1 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[GEP]], i32 0, i32 0 -; SSE2-NEXT: [[S:%.*]] = load i16, i16* [[TMP1]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; AVX2-NEXT: [[TMP1:%.*]] = bitcast <2 x i16>* [[P:%.*]] to <8 x i16>* -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 4 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, <2 x i16>* [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16>* [[GEP]] to <4 x i16>* +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]], align 8 +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <2 x i16>, <2 x i16>* %p, i64 1 %l = load <2 x i16>, <2 x i16>* %gep, align 8