diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19846,13 +19846,14 @@
                                         SelectionDAG &DAG) {
   if (!Shuf->getOperand(1).isUndef())
     return SDValue();
-  auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
-  if (!Splat || !Splat->isSplat())
+
+  APInt UndefElts;
+  APInt DemandedElts =
+      APInt::getAllOnesValue(Shuf->getValueType(0).getVectorNumElements());
+  if (!DAG.isSplatValue(Shuf->getOperand(0), DemandedElts, UndefElts))
     return SDValue();
 
-  ArrayRef<int> ShufMask = Shuf->getMask();
-  ArrayRef<int> SplatMask = Splat->getMask();
-  assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");
+  ArrayRef<int> UserMask = Shuf->getMask();
 
   // Prefer simplifying to the splat-shuffle, if possible. This is legal if
   // every undef mask element in the splat-shuffle has a corresponding undef
@@ -19870,21 +19871,24 @@
   // In this case the composed mask includes all undef elements of SplatMask
   // and in addition sets element zero to undef. It is safe to simplify to
   // the splat-shuffle.
-  auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
-                                       ArrayRef<int> SplatMask) {
-    for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
-      if (UserMask[i] != -1 && SplatMask[i] == -1 &&
-          SplatMask[UserMask[i]] != -1)
-        return false;
-    return true;
-  };
-  if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
+  bool CanSimplifyToExistingSplat = true;
+  for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
+    if (UserMask[i] != -1 && UndefElts[i] && !UndefElts[UserMask[i]])
+      CanSimplifyToExistingSplat = false;
+
+  if (CanSimplifyToExistingSplat)
     return Shuf->getOperand(0);
 
+  // Unless the splat value is a shuffle as well, there is nothing more to do.
+  auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
+  if (!Splat)
+    return SDValue();
+
   // Create a new shuffle with a mask that is composed of the two shuffles'
   // masks.
+  ArrayRef<int> SplatMask = Splat->getMask();
   SmallVector<int, 32> NewMask;
-  for (int Idx : ShufMask)
+  for (int Idx : UserMask)
     NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
 
   return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -123,6 +123,9 @@
   // Hoist bitcasts out of shuffles
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
 
+  // Remove undef lanes from splats
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
+
   // Scalarize extracts of abs splats
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
 
@@ -1733,6 +1736,21 @@
   return DAG.getBitcast(DstType, NewShuffle);
 }
 
+static SDValue
+performBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  auto &DAG = DCI.DAG;
+  auto *Build = cast<BuildVectorSDNode>(N);
+
+  // Remove undef lanes from splats. They don't allow us to make any extra
+  // optimizations and they can inhibit splat scalarization combines.
+  BitVector UndefElts;
+  if (SDValue SplatVal = Build->getSplatValue(&UndefElts))
+    if (UndefElts.any())
+      return DAG.getSplatBuildVector(N->getValueType(0), SDLoc(N), SplatVal);
+
+  return SDValue();
+}
+
 static SDValue
 performEXTRACT_VECTOR_ELTCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
@@ -1765,6 +1783,8 @@
     return SDValue();
   case ISD::VECTOR_SHUFFLE:
     return performVECTOR_SHUFFLECombine(N, DCI);
+  case ISD::BUILD_VECTOR:
+    return performBUILD_VECTORCombine(N, DCI);
   case ISD::EXTRACT_VECTOR_ELT:
     return performEXTRACT_VECTOR_ELTCombine(N, DCI);
   }
diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll
@@ -93,10 +93,14 @@
   ret <8 x i16> %v7
 }
 
+;; TODO: This should be a swizzle, but will need a custom combine to
+;; preempt the combine that removes undef lanes from splat
+;; build_vectors, since swizzle lowering depends on those lanes being
+;; undef.
+
 ; CHECK-LABEL: swizzle_one_i8x16:
 ; CHECK-NEXT: .functype swizzle_one_i8x16 (v128, v128) -> (v128)
-; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1
-; CHECK-NEXT: return $pop[[L0]]
+; CHECK-NOT: v8x16.swizzle
 define <16 x i8> @swizzle_one_i8x16(<16 x i8> %src, <16 x i8> %mask) {
   %m0 = extractelement <16 x i8> %mask, i32 0
   %s0 = extractelement <16 x i8> %src, i8 %m0
@@ -104,6 +108,20 @@
   ret <16 x i8> %v0
 }
 
+; CHECK-LABEL: swizzle_two_i8x16:
+; CHECK-NEXT: .functype swizzle_two_i8x16 (v128, v128) -> (v128)
+; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1
+; CHECK-NEXT: return $pop[[L0]]
+define <16 x i8> @swizzle_two_i8x16(<16 x i8> %src, <16 x i8> %mask) {
+  %m0 = extractelement <16 x i8> %mask, i32 0
+  %s0 = extractelement <16 x i8> %src, i8 %m0
+  %v0 = insertelement <16 x i8> undef, i8 %s0, i32 0
+  %m1 = extractelement <16 x i8> %mask, i32 1
+  %s1 = extractelement <16 x i8> %src, i8 %m1
+  %v1 = insertelement <16 x i8> %v0, i8 %s1, i32 1
+  ret <16 x i8> %v1
+}
+
 ; CHECK-LABEL: swizzle_all_i8x16:
 ; CHECK-NEXT: .functype swizzle_all_i8x16 (v128, v128) -> (v128)
 ; CHECK-NEXT: v8x16.swizzle $push[[L0:[0-9]+]]=, $0, $1
@@ -245,7 +263,7 @@
 
 ; CHECK-LABEL: undef_const_insert_f32x4:
 ; CHECK-NEXT: .functype undef_const_insert_f32x4 () -> (v128)
-; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x0p0, 0x1.5p5, 0x0p0, 0x0p0
+; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5
 ; UNIMP-NEXT: return $pop[[L0]]
 ; SIMD-VM: f32x4.splat
 define <4 x float> @undef_const_insert_f32x4() {
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
--- a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
@@ -40,23 +40,17 @@
   ret <16 x i8> %r
 }
 
-;; TODO: Complete scalarization by removing the unnecessary shuffle
-
 ; CHECK-LABEL: shl_abs_add:
 ; CHECK-NEXT: .functype shl_abs_add (v128, i32, i32) -> (v128)
-; CHECK-NEXT: i8x16.splat $push1=, $1
-; CHECK-NEXT: i8x16.splat $push0=, $2
-; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0
-; CHECK-NEXT: v8x16.shuffle $push3=, $pop2, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: i8x16.extract_lane_u $push11=, $pop3, 0
-; CHECK-NEXT: local.tee $push10=, $1=, $pop11
-; CHECK-NEXT: i32.const $push4=, 31
-; CHECK-NEXT: i32.shr_s $push9=, $1, $pop4
-; CHECK-NEXT: local.tee $push8=, $1=, $pop9
-; CHECK-NEXT: i32.add $push5=, $pop10, $pop8
-; CHECK-NEXT: i32.xor $push6=, $pop5, $1
-; CHECK-NEXT: i8x16.shl $push7=, $0, $pop6
-; CHECK-NEXT: return $pop7
+; CHECK-NEXT: i32.add $push7=, $1, $2
+; CHECK-NEXT: local.tee $push6=, $1=, $pop7
+; CHECK-NEXT: i32.const $push0=, 31
+; CHECK-NEXT: i32.shr_s $push5=, $1, $pop0
+; CHECK-NEXT: local.tee $push4=, $1=, $pop5
+; CHECK-NEXT: i32.add $push1=, $pop6, $pop4
+; CHECK-NEXT: i32.xor $push2=, $pop1, $1
+; CHECK-NEXT: i8x16.shl $push3=, $0, $pop2
+; CHECK-NEXT: return $pop3
 define <16 x i8> @shl_abs_add(<16 x i8> %v, i8 %a, i8 %b) {
   %t1 = insertelement <16 x i8> undef, i8 %a, i32 0
   %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
--- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll
@@ -514,24 +514,21 @@
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpsllq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsrlq %xmm2, %xmm4, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
-; AVX1-NEXT:    vpsrlq %xmm6, %xmm4, %xmm7
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpsrlq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpsllq %xmm1, %xmm4, %xmm3
-; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm0, %ymm4, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -560,24 +560,21 @@
 ; AVX1-LABEL: splatvar_funnnel_v4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [63,63]
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm5, %xmm6
+; AVX1-NEXT:    vpsrlq %xmm4, %xmm0, %xmm4
+; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX1-NEXT:    vpsubq %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vpsllq %xmm2, %xmm4, %xmm5
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[2,3,0,1]
-; AVX1-NEXT:    vpsllq %xmm6, %xmm4, %xmm7
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6,7]
-; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpsllq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX1-NEXT:    vpsllq %xmm2, %xmm5, %xmm2
+; AVX1-NEXT:    vpsubq %xmm1, %xmm6, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrlq %xmm1, %xmm4, %xmm3
-; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm4, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: splatvar_funnnel_v4i64:
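
For reference, a minimal IR sketch (function name and body are illustrative, not taken from the tests above) of the pattern the new BUILD_VECTOR combine targets: a single insertelement into undef lowers to a splat build_vector whose remaining lanes are undef, and the combine fills those lanes with the splat value, matching the updated undef_const_insert_f32x4 expectation above.

; Illustrative only: one defined lane, three undef lanes.
; Previously the UNIMP lowering kept the undef lanes as zeros
; (v128.const 0x0p0, 0x1.5p5, 0x0p0, 0x0p0); with the combine the node is
; treated as a full splat (v128.const 0x1.5p5, 0x1.5p5, 0x1.5p5, 0x1.5p5).
define <4 x float> @splat_with_undef_lanes_example() {
  %v = insertelement <4 x float> undef, float 42.0, i32 1
  ret <4 x float> %v
}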