Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8033,6 +8033,7 @@
 /// For convenience, this code also bundles all of the subtarget feature set
 /// filtering. While a little annoying to re-dispatch on type here, there isn't
 /// a convenient way to factor it out.
+/// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
 static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget *Subtarget,
@@ -8105,6 +8106,20 @@
     // Only AVX2 has register broadcasts.
     if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V))
       return SDValue();
+  } else if (MayFoldLoad(V)) {
+    // If we are broadcasting a load that is only used by the shuffle
+    // then we can reduce the vector load to the broadcasted scalar load.
+    LoadSDNode *Ld = cast<LoadSDNode>(V);
+    SDValue BaseAddr = Ld->getOperand(1);
+    EVT AddrVT = BaseAddr.getValueType();
+    EVT SVT = VT.getScalarType();
+    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
+    SDValue NewAddr = DAG.getNode(
+        ISD::ADD, DL, AddrVT, BaseAddr,
+        DAG.getConstant(Offset, DL, AddrVT));
+    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
+                    DAG.getMachineFunction().getMachineMemOperand(
+                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
   } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) {
     // We can't broadcast from a vector register without AVX2, and we can only
     // broadcast from the zero-element of a vector register.
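
The new else-if above narrows a broadcast of a whole-vector load into a single scalar load: MayFoldLoad() only accepts a normal load with exactly one use, so no other node still needs the wide value, and the MachineMemOperand is rebased by the same offset so the memory info stays accurate. The address math is simply BroadcastIdx times the scalar store size. A minimal standalone sketch of that math (plain C++, illustrative only - broadcastByteOffset is not an LLVM API; the asserts mirror the CHECK lines updated in the tests below):

    #include <cassert>
    #include <cstdint>

    // Byte offset of the broadcasted element: the lane index scaled by the
    // number of bytes the scalar type occupies in memory (what
    // SVT.getStoreSize() returns in the patch above).
    uint64_t broadcastByteOffset(uint64_t BroadcastIdx, uint64_t ScalarStoreSize) {
      return BroadcastIdx * ScalarStoreSize;
    }

    int main() {
      assert(broadcastByteOffset(2, 8) == 16); // v4f64 lane 2 -> vbroadcastsd 16(%rdi)
      assert(broadcastByteOffset(7, 4) == 28); // v8f32 lane 7 -> vbroadcastss 28(%rdi)
      assert(broadcastByteOffset(1, 2) == 2);  // v8i16 lane 1 -> vpbroadcastw 2(%rdi)
      assert(broadcastByteOffset(1, 1) == 1);  // v16i8 lane 1 -> vpbroadcastb 1(%rdi)
      return 0;
    }
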
@@ -26312,7 +26327,7 @@
 
   // If we're negating a FMUL node on a target with FMA, then we can avoid the
   // use of a constant by performing (-0 - A*B) instead.
-  // FIXME: Check rounding control flags as well once it becomes available. 
+  // FIXME: Check rounding control flags as well once it becomes available.
   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
       Arg->getFlags()->hasNoSignedZeros() && Subtarget->hasAnyFMA()) {
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
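
The hunk above only strips trailing whitespace, but the comment it touches records a real trick: -(A*B) can be computed as (-0.0) - A*B, which an FMA target then matches as a fused negate-multiply-subtract instead of materializing a sign-mask constant for an XOR-based FNEG. The identity is exact for nonzero products; signed zeros are the delicate case (hence the hasNoSignedZeros() guard), and directed rounding can flip the sign of a zero result, which is presumably what the rounding-control FIXME anticipates. A quick standalone check of the identity (plain C++, illustrative only; negateViaSub is a made-up name):

    #include <cassert>

    // (-0.0) - a*b computes the same value as -(a*b) for nonzero products,
    // with no rounding involved in the final subtraction.
    double negateViaSub(double a, double b) { return -0.0 - a * b; }

    int main() {
      assert(negateViaSub(3.0, 2.0) == -6.0);
      assert(negateViaSub(-3.0, 2.0) == 6.0);
      assert(negateViaSub(0.5, 0.5) == -0.25);
      return 0;
    }
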
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -145,10 +145,7 @@
 define <4 x double> @splat_load_4f64_2222(<4 x double>* %ptr) {
 ; CHECK-LABEL: splat_load_4f64_2222:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
 ; CHECK-NEXT: retq
   %x = load <4 x double>, <4 x double>* %ptr
   %x1 = shufflevector <4 x double> %x, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
@@ -158,7 +155,7 @@
 define <4 x float> @splat_load_4f32_0000(<4 x float>* %ptr) {
 ; CHECK-LABEL: splat_load_4f32_0000:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0]
+; CHECK-NEXT: vbroadcastss (%rdi), %xmm0
 ; CHECK-NEXT: retq
   %x = load <4 x float>, <4 x float>* %ptr
   %x1 = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -168,10 +165,7 @@
 define <8 x float> @splat_load_8f32_77777777(<8 x float>* %ptr) {
 ; CHECK-LABEL: splat_load_8f32_77777777:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastss 28(%rdi), %ymm0
 ; CHECK-NEXT: retq
   %x = load <8 x float>, <8 x float>* %ptr
   %x1 = shufflevector <8 x float> %x, <8 x float> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
Index: test/CodeGen/X86/avx-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx-vbroadcast.ll
+++ test/CodeGen/X86/avx-vbroadcast.ll
@@ -144,7 +144,7 @@
 define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4f32_4f32_1111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; CHECK-NEXT: vbroadcastss 4(%rdi), %xmm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -155,8 +155,7 @@
 define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_8f32_4f32_33333333:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[3,3,3,3]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastss 12(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -167,10 +166,7 @@
 define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_8f32_8f32_55555555:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastss 20(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <8 x float>, <8 x float>* %ptr
@@ -231,9 +227,7 @@
 define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4f64_2f64_1111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vmovhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastsd 8(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <2 x double>, <2 x double>* %ptr
@@ -244,10 +238,7 @@
 define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4f64_4f64_2222:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovapd (%rdi), %ymm0
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x double>, <4 x double>* %ptr
Index: test/CodeGen/X86/avx2-vbroadcast.ll
===================================================================
--- test/CodeGen/X86/avx2-vbroadcast.ll
+++ test/CodeGen/X86/avx2-vbroadcast.ll
@@ -177,8 +177,7 @@
 define <16 x i8> @load_splat_16i8_16i8_1111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_16i8_16i8_1111111111111111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; CHECK-NEXT: vpbroadcastb 1(%rdi), %xmm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -189,9 +188,7 @@
 define <32 x i8> @load_splat_32i8_16i8_11111111111111111111111111111111(<16 x i8>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_32i8_16i8_11111111111111111111111111111111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastb 1(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -202,9 +199,7 @@
 define <32 x i8> @load_splat_32i8_32i8_11111111111111111111111111111111(<32 x i8>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_32i8_32i8_11111111111111111111111111111111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastb 1(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <32 x i8>, <32 x i8>* %ptr
@@ -215,8 +210,7 @@
 define <8 x i16> @load_splat_8i16_8i16_11111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_8i16_8i16_11111111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -227,9 +221,7 @@
 define <16 x i16> @load_splat_16i16_8i16_1111111111111111(<8 x i16>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_16i16_8i16_1111111111111111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastw 2(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -240,9 +232,7 @@
 define <16 x i16> @load_splat_16i16_16i16_1111111111111111(<16 x i16>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_16i16_16i16_1111111111111111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %ymm0
-; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastw 2(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <16 x i16>, <16 x i16>* %ptr
@@ -253,7 +243,7 @@
 define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4i32_4i32_1111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,1,1,1]
+; CHECK-NEXT: vbroadcastss 4(%rdi), %xmm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
@@ -264,9 +254,7 @@
 define <8 x i32> @load_splat_8i32_4i32_33333333(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_8i32_4i32_33333333:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vpbroadcastd LCPI15_0(%rip), %ymm1
-; CHECK-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vbroadcastss 12(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
@@ -277,8 +265,7 @@
 define <8 x i32> @load_splat_8i32_8i32_55555555(<8 x i32>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_8i32_8i32_55555555:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpbroadcastd LCPI16_0(%rip), %ymm0
-; CHECK-NEXT: vpermd (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastss 20(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <8 x i32>, <8 x i32>* %ptr
@@ -289,7 +276,7 @@
 define <4 x float> @load_splat_4f32_4f32_1111(<4 x float>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4f32_4f32_1111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,1,1,1]
+; CHECK-NEXT: vbroadcastss 4(%rdi), %xmm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -300,9 +287,7 @@
 define <8 x float> @load_splat_8f32_4f32_33333333(<4 x float>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_8f32_4f32_33333333:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovaps (%rdi), %xmm0
-; CHECK-NEXT: vbroadcastss LCPI18_0(%rip), %ymm1
-; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vbroadcastss 12(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -313,8 +298,7 @@
 define <8 x float> @load_splat_8f32_8f32_55555555(<8 x float>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_8f32_8f32_55555555:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vbroadcastss LCPI19_0(%rip), %ymm0
-; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0
+; CHECK-NEXT: vbroadcastss 20(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <8 x float>, <8 x float>* %ptr
@@ -325,7 +309,7 @@
 define <2 x i64> @load_splat_2i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_2i64_2i64_1111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,3,2,3]
+; CHECK-NEXT: vpbroadcastq 8(%rdi), %xmm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <2 x i64>, <2 x i64>* %ptr
@@ -336,8 +320,7 @@
 define <4 x i64> @load_splat_4i64_2i64_1111(<2 x i64>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4i64_2i64_1111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovdqa (%rdi), %xmm0
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; CHECK-NEXT: vbroadcastsd 8(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <2 x i64>, <2 x i64>* %ptr
@@ -348,7 +331,7 @@
 define <4 x i64> @load_splat_4i64_4i64_2222(<4 x i64>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4i64_4i64_2222:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 = mem[2,2,2,2]
+; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x i64>, <4 x i64>* %ptr
@@ -371,8 +354,7 @@
 define <4 x double> @load_splat_4f64_2f64_1111(<2 x double>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4f64_2f64_1111:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vmovapd (%rdi), %xmm0
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1]
+; CHECK-NEXT: vbroadcastsd 8(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <2 x double>, <2 x double>* %ptr
@@ -383,7 +365,7 @@
 define <4 x double> @load_splat_4f64_4f64_2222(<4 x double>* %ptr) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: load_splat_4f64_4f64_2222:
 ; CHECK: ## BB#0: ## %entry
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,2,2,2]
+; CHECK-NEXT: vbroadcastsd 16(%rdi), %ymm0
 ; CHECK-NEXT: retq
 entry:
   %ld = load <4 x double>, <4 x double>* %ptr
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1357,8 +1357,7 @@
 ;
 ; AVX512VL-LABEL: splat_mem_v4i64_from_v2i64:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0
-; AVX512VL-NEXT: vpbroadcastq %xmm0, %ymm0
+; AVX512VL-NEXT: vpbroadcastq (%rdi), %ymm0
 ; AVX512VL-NEXT: retq
   %v = load <2 x i64>, <2 x i64>* %ptr
   %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
@@ -1366,21 +1365,10 @@
 }
 
 define <4 x double> @splat_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
-; AVX1-LABEL: splat_mem_v4f64_from_v2f64:
-; AVX1: # BB#0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splat_mem_v4f64_from_v2f64:
-; AVX2: # BB#0:
-; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: splat_mem_v4f64_from_v2f64:
-; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vbroadcastsd (%rdi), %ymm0
-; AVX512VL-NEXT: retq
+; ALL-LABEL: splat_mem_v4f64_from_v2f64:
+; ALL: # BB#0:
+; ALL-NEXT: vbroadcastsd (%rdi), %ymm0
+; ALL-NEXT: retq
   %v = load <2 x double>, <2 x double>* %ptr
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
   ret <4 x double> %shuffle
 }