diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12666,6 +12666,18 @@
   return SDValue();
 }
 
+/// Return true if \p Mask passes isUndefOrEqual(Mask, 0), i.e. the shuffle
+/// demands only the 0'th element of its input.
+static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
+  return isUndefOrEqual(Mask, 0);
+}
+
+/// Return true if \p Mask is accepted by either isNoopShuffleMask or
+/// isBroadcastShuffleMask.
+static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
+  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
+}
+
 /// Generic routine to decompose a shuffle and blend into independent
 /// blends and permutes.
 ///
@@ -12699,6 +12711,44 @@
     }
   }
 
+  // If \p InputMask only demands the 0'th element of \p Input, but places it
+  // somewhere other than (just) the 0'th position, broadcast \p Input and
+  // rewrite \p InputMask into a no-op (identity) mask over the broadcast.
+  auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
+                                         &DAG](SDValue &Input,
+                                               MutableArrayRef<int> InputMask) {
+    unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
+    // Without AVX2, only proceed on AVX for elements of at least 32 bits
+    // whose input passes MayFoldLoad.
+    if (!Subtarget.hasAVX2() &&
+        (!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
+      return;
+    // A mask that is already a no-op needs no rewriting.
+    if (isNoopShuffleMask(InputMask))
+      return;
+    assert(isBroadcastShuffleMask(InputMask) &&
+           "Expected to demand only the 0'th element.");
+    Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
+    // Every lane of the broadcast holds the demanded element, so each defined
+    // mask element can simply select its own position; negative elements are
+    // left untouched.
+    for (auto I : enumerate(InputMask)) {
+      int &InputMaskElt = I.value();
+      if (InputMaskElt >= 0)
+        InputMaskElt = I.index();
+    }
+  };
+
+  // Currently, we may need to produce one shuffle per input, and blend results.
+  // It is possible that the shuffle for one of the inputs is already a no-op.
+  // See if we can simplify non-no-op shuffles into broadcasts,
+  // which we consider to be strictly better than an arbitrary shuffle.
+  if (isNoopOrBroadcastShuffleMask(V1Mask) &&
+      isNoopOrBroadcastShuffleMask(V2Mask)) {
+    canonicalizeBroadcastableInput(V1, V1Mask);
+    canonicalizeBroadcastableInput(V2, V2Mask);
+  }
+
   // Try to lower with the simpler initial blend/unpack/rotate strategies unless
   // one of the input shuffles would be a no-op. 
We prefer to shuffle inputs as // the shuffle may be able to fold with a load or other benefit. However, when diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -284,7 +284,7 @@ define <4 x i64> @vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i64_source_subvec_0_target_subvec_mask_3_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1 +; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,1] ; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-NEXT: retq %r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> @@ -741,7 +741,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_1_binary(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_1_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -763,8 +763,7 @@ define <32 x i8> @vec256_eltty_i8_source_subvec_0_target_subvec_mask_2_binary(<32 x i8> %x, <32 x i8> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i8_source_subvec_0_target_subvec_mask_2_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -800,7 +799,7 @@ ; CHECK-LABEL: vec256_eltty_i8_source_subvec_1_target_subvec_mask_1_unary: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 -; CHECK-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] +; CHECK-NEXT: vpbroadcastb %xmm1, %ymm1 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -143,9 +143,9 @@ ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3] ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2 -; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: pair_sum_v4i32_v4i32: @@ -458,20 +458,19 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm1 -; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm4[0,3] -; 
AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[0] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[1,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,3] +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: retq @@ -752,16 +751,16 @@ ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm4 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm5 +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,2,2,2] -; AVX2-SLOW-NEXT: 
vpaddd %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: sequential_sum_v4i32_v4i32: @@ -776,15 +775,15 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2] -; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ; AVX2-FAST-NEXT: retq %5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> %6 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> @@ -1089,28 +1088,28 @@ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSSE3-FAST-NEXT: retq ; -; AVX-SLOW-LABEL: reduction_sum_v4i32_v4i32: -; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, 
%xmm0 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] -; AVX-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 -; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 -; AVX-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 -; AVX-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX-SLOW-NEXT: retq +; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32: +; AVX1-SLOW: # %bb.0: +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1] +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-SLOW-NEXT: vpaddd %xmm6, %xmm3, %xmm3 +; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1] +; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX1-SLOW-NEXT: vpaddd %xmm5, %xmm2, 
%xmm2 +; AVX1-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32: ; AVX-FAST: # %bb.0: @@ -1126,6 +1125,28 @@ ; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX-FAST-NEXT: retq +; +; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm5, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq %5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0) %6 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %1) %7 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %2)