diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22641,6 +22641,56 @@
     }
   }
 
+  // If we're not performing a select/blend shuffle, see if we can convert the
+  // shuffle into an AND node where all the out-of-lane elements are known zero.
+  if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
+    bool IsInLaneMask = true;
+    ArrayRef<int> Mask = SVN->getMask();
+    SmallVector<int, 16> ClearMask(NumElts, -1);
+    APInt DemandedLHS = APInt::getNullValue(NumElts);
+    APInt DemandedRHS = APInt::getNullValue(NumElts);
+    for (int I = 0; I != (int)NumElts; ++I) {
+      int M = Mask[I];
+      if (M < 0)
+        continue;
+      ClearMask[I] = M == I ? I : (I + NumElts);
+      IsInLaneMask &= (M == I) || (M == (I + NumElts));
+      if (M != I) {
+        APInt &Demanded = M < (int)NumElts ? DemandedLHS : DemandedRHS;
+        Demanded.setBit(M % NumElts);
+      }
+    }
+    // TODO: Should we try to mask with N1 as well?
+    if (!IsInLaneMask &&
+        (!DemandedLHS.isNullValue() || !DemandedRHS.isNullValue()) &&
+        (DemandedLHS.isNullValue() ||
+         DAG.MaskedVectorIsZero(N0, DemandedLHS)) &&
+        (DemandedRHS.isNullValue() ||
+         DAG.MaskedVectorIsZero(N1, DemandedRHS))) {
+      SDLoc DL(N);
+      EVT IntVT = VT.changeVectorElementTypeToInteger();
+      EVT IntSVT = VT.getVectorElementType().changeTypeToInteger();
+      SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT);
+      SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT);
+      SmallVector<SDValue, 16> AndMask(NumElts, DAG.getUNDEF(IntSVT));
+      for (int I = 0; I != (int)NumElts; ++I)
+        if (0 <= Mask[I])
+          AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt;
+
+      // See if a clear mask is legal instead of going via
+      // XformToShuffleWithZero which loses UNDEF mask elements.
+      if (TLI.isVectorClearMaskLegal(ClearMask, IntVT))
+        return DAG.getBitcast(
+            VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0),
+                                     DAG.getConstant(0, DL, IntVT), ClearMask));
+
+      if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT))
+        return DAG.getBitcast(
+            VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0),
+                            DAG.getBuildVector(IntVT, DL, AndMask)));
+    }
+  }
+
   // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
   // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
   if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -549,6 +549,10 @@
   /// should be stack expanded.
   bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
 
+  /// Similar to isShuffleMaskLegal. Return true if the given 'select with
+  /// zero' shuffle mask can be codegen'd directly.
+  bool isVectorClearMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
   /// Return the ISD::SETCC ValueType.
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -11811,6 +11811,12 @@
           isConcatMask(M, VT, VT.getSizeInBits() == 128));
 }
 
+bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
+                                                   EVT VT) const {
+  // Just delegate to the generic legality; clear masks aren't special.
+  return isShuffleMaskLegal(M, VT);
+}
+
 /// getVShiftImm - Check if this is a valid build_vector for the immediate
 /// operand of a vector shift operation, where all the elements of the
 /// build_vector must have the same constant integer value.
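For readers skimming the diff: a minimal IR-level sketch of the pattern the new combine targets. The function names and mask constants below are illustrative only (the masks in the patch's own vselect_equivalent_shuffle_* tests are not reproduced here), but the shape is the same: the second shuffle operand is all zeros and at least one lane is taken from an out-of-lane position of that zero vector, so the whole shuffle is equivalent to an AND with an all-ones/zero build vector.

; Illustrative input: lanes 1 and 3 come from the zero vector, and lane 1
; uses an out-of-lane index (element 10 rather than the in-lane 9).
define <8 x i16> @shuffle_select_with_zero(<8 x i16> %a) {
  %r = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer,
           <8 x i32> <i32 0, i32 10, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %r
}

; Conceptually equivalent form the combine aims for when ISD::AND is legal:
; a per-lane all-ones/zero mask, which AArch64 lowers to movi/and or a
; constant-pool load plus and, as the updated tests below show.
define <8 x i16> @shuffle_select_with_zero_as_and(<8 x i16> %a) {
  %r = and <8 x i16> %a,
           <i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 -1, i16 -1, i16 -1>
  ret <8 x i16> %r
}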
diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
--- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
@@ -4,8 +4,7 @@
 define <2 x i64> @extract0_i32_zext_insert0_i64_undef(<4 x i32> %x) {
 ; CHECK-LABEL: extract0_i32_zext_insert0_i64_undef:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.2d, #0000000000000000
-; CHECK-NEXT:    zip1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    mov v0.s[1], wzr
 ; CHECK-NEXT:    ret
   %e = extractelement <4 x i32> %x, i32 0
   %z = zext i32 %e to i64
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -907,23 +907,11 @@
   ret <8 x i8> %c
 }
 
-; CHECK-LABEL: .LCPI90_0:
-; CHECK-NEXT:    .byte 0
-; CHECK-NEXT:    .byte 255
-; CHECK-NEXT:    .byte 2
-; CHECK-NEXT:    .byte 255
-; CHECK-NEXT:    .byte 4
-; CHECK-NEXT:    .byte 5
-; CHECK-NEXT:    .byte 6
-; CHECK-NEXT:    .byte 7
 define <8 x i8> @vselect_equivalent_shuffle_v8i8_zero(<8 x i8> %a) {
 ; CHECK-LABEL: vselect_equivalent_shuffle_v8i8_zero:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI90_0
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    mov v0.d[1], v0.d[0]
-; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI90_0]
-; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
+; CHECK-NEXT:    movi d1, #0xffffffff00ff00ff
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-NEXT:    ret
   %c = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <8 x i32> 
   ret <8 x i8> %c
@@ -982,28 +970,20 @@
 }
 
 ; CHECK-LABEL: .LCPI93_0:
-; CHECK-NEXT:    .byte 0
-; CHECK-NEXT:    .byte 1
-; CHECK-NEXT:    .byte 255
-; CHECK-NEXT:    .byte 255
-; CHECK-NEXT:    .byte 4
-; CHECK-NEXT:    .byte 5
-; CHECK-NEXT:    .byte 255
-; CHECK-NEXT:    .byte 255
-; CHECK-NEXT:    .byte 8
-; CHECK-NEXT:    .byte 9
-; CHECK-NEXT:    .byte 10
-; CHECK-NEXT:    .byte 11
-; CHECK-NEXT:    .byte 12
-; CHECK-NEXT:    .byte 13
-; CHECK-NEXT:    .byte 14
-; CHECK-NEXT:    .byte 15
+; CHECK-NEXT:    .hword 65535 // 0xffff
+; CHECK-NEXT:    .hword 0 // 0x0
+; CHECK-NEXT:    .hword 65535 // 0xffff
+; CHECK-NEXT:    .hword 0 // 0x0
+; CHECK-NEXT:    .hword 65535 // 0xffff
+; CHECK-NEXT:    .hword 65535 // 0xffff
+; CHECK-NEXT:    .hword 65535 // 0xffff
+; CHECK-NEXT:    .hword 65535 // 0xffff
 define <8 x i16> @vselect_equivalent_shuffle_v8i16_zero(<8 x i16> %a) {
 ; CHECK-LABEL: vselect_equivalent_shuffle_v8i16_zero:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI93_0
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI93_0]
-; CHECK-NEXT:    tbl v0.16b, { v0.16b }, v1.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %c = shufflevector <8 x i16> %a, <8 x i16> zeroinitializer, <8 x i32> 
   ret <8 x i16> %c
diff --git a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
--- a/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
+++ b/llvm/test/CodeGen/ARM/vector-DAGCombine.ll
@@ -56,7 +56,6 @@
 ; CHECK-NEXT:    bne .LBB3_2
 ; CHECK-NEXT:  @ %bb.1: @ %bb1.preheader
 ; CHECK-NEXT:    vmov.i32 q8, #0x0
-; CHECK-NEXT:    vext.8 q8, q8, q8, #4
 ; CHECK-NEXT:  .LBB3_2: @ %bb2
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    vmov r2, r3, d17
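Before the X86 test updates below, one detail worth spelling out: the combine first tries a target-legal 'clear mask' shuffle (queried through the new isVectorClearMaskLegal hook) rather than going through XformToShuffleWithZero, because the latter loses UNDEF mask elements. A hedged sketch of why preserving undef lanes matters; the function name and mask are illustrative, not taken from the patch's tests.

; A zero-extend-of-lane-0 pattern, much like build-vector-extract.ll above,
; written as a shuffle: lane 0 is kept, lane 1 is cleared via an out-of-lane
; zero element, and lanes 2-3 stay undef.
define <2 x i64> @zext_lane0_as_shuffle(<4 x i32> %x) {
  %z = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer,
           <4 x i32> <i32 0, i32 4, i32 undef, i32 undef>
  ; Keeping lanes 2-3 undef (instead of forcing them to zero) leaves later
  ; lowering free to touch only lane 1, which is presumably what lets the
  ; AArch64 test above collapse movi+zip1 into a single "mov v0.s[1], wzr".
  %r = bitcast <4 x i32> %z to <2 x i64>
  ret <2 x i64> %r
}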
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -1058,47 +1058,19 @@
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: sad_double_reduction:
-; AVX1:       # %bb.0: # %bb
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX1-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sad_double_reduction:
-; AVX2:       # %bb.0: # %bb
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX2-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sad_double_reduction:
-; AVX512:       # %bb.0: # %bb
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX512-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    retq
+; AVX-LABEL: sad_double_reduction:
+; AVX:       # %bb.0: # %bb
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
 bb:
   %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
   %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
@@ -1148,47 +1120,19 @@
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    retq
 ;
-; AVX1-LABEL: sad_double_reduction_abs:
-; AVX1:       # %bb.0: # %bb
-; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX1-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX1-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX1-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: sad_double_reduction_abs:
-; AVX2:       # %bb.0: # %bb
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX2-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: sad_double_reduction_abs:
-; AVX512:       # %bb.0: # %bb
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT:    vmovdqu (%rdx), %xmm1
-; AVX512-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; AVX512-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vmovd %xmm0, %eax
-; AVX512-NEXT:    retq
+; AVX-LABEL: sad_double_reduction_abs:
+; AVX:       # %bb.0: # %bb
+; AVX-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX-NEXT:    vpsadbw (%rsi), %xmm0, %xmm0
+; AVX-NEXT:    vmovdqu (%rdx), %xmm1
+; AVX-NEXT:    vpsadbw (%rcx), %xmm1, %xmm1
+; AVX-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    retq
 bb:
   %tmp = load <16 x i8>, <16 x i8>* %arg, align 1
   %tmp4 = load <16 x i8>, <16 x i8>* %arg1, align 1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1013,21 +1013,21 @@
 ; SSE2-LABEL: shuffle_v4f32_0z2z:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    xorps %xmm1, %xmm1
-; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v4f32_0z2z:
 ; SSE3:       # %bb.0:
 ; SSE3-NEXT:    xorps %xmm1, %xmm1
-; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
 ; SSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v4f32_0z2z:
 ; SSSE3:       # %bb.0:
 ; SSSE3-NEXT:    xorps %xmm1, %xmm1
-; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,0]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
 ; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSSE3-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2249,9 +2249,7 @@
 define <8 x i64> @test_v8i64_insert_zero_128(<8 x i64> %a) {
 ; ALL-LABEL: test_v8i64_insert_zero_128:
 ; ALL:       # %bb.0:
-; ALL-NEXT:    movb $3, %al
-; ALL-NEXT:    kmovw %eax, %k1
-; ALL-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; ALL-NEXT:    vmovaps %xmm0, %xmm0
 ; ALL-NEXT:    ret{{[l|q]}}
   %res = shufflevector <8 x i64> %a, <8 x i64> , <8 x i32> 
   ret <8 x i64> %res