diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1711,11 +1711,6 @@
   bool MaskedValueIsZero(SDValue Op, const APInt &Mask,
                          const APInt &DemandedElts, unsigned Depth = 0) const;
 
-  /// Return true if the DemandedElts of the vector Op are all zero. We
-  /// use this predicate to simplify operations downstream.
-  bool MaskedElementsAreZero(SDValue Op, const APInt &DemandedElts,
-                             unsigned Depth = 0) const;
-
   /// Return true if '(Op & Mask) == Mask'.
   /// Op and Mask are known to be the same type.
   bool MaskedValueIsAllOnes(SDValue Op, const APInt &Mask,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2446,19 +2446,6 @@
   return Mask.isSubsetOf(computeKnownBits(V, DemandedElts, Depth).Zero);
 }
 
-/// Return true if the DemandedElts of the vector Op are all zero. We
-/// use this predicate to simplify operations downstream.
-bool SelectionDAG::MaskedElementsAreZero(SDValue Op, const APInt &DemandedElts,
-                                         unsigned Depth) const {
-  assert(Op.getValueType().isFixedLengthVector() &&
-         Op.getValueType().getVectorNumElements() ==
-             DemandedElts.getBitWidth() &&
-         "MaskedElementsAreZero vector size mismatch");
-  unsigned BitWidth = Op.getScalarValueSizeInBits();
-  APInt DemandedBits = APInt::getAllOnesValue(BitWidth);
-  return MaskedValueIsZero(Op, DemandedBits, DemandedElts, Depth);
-}
-
 /// MaskedValueIsAllOnes - Return true if '(Op & Mask) == Mask'.
 bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask,
                                         unsigned Depth) const {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35974,15 +35974,6 @@
     }
   }
 
-  // See if this is a blend with zero - in which case check if the zero'd
-  // elements are already zero.
-  if (isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0)) {
-    assert(!KnownZero.isNullValue() && "Shuffle has no zero elements");
-    SDValue NewV1 = CanonicalizeShuffleInput(MaskVT, V1);
-    if (DAG.MaskedElementsAreZero(NewV1, KnownZero))
-      return DAG.getBitcast(RootVT, NewV1);
-  }
-
   SDValue NewV1 = V1; // Save operand in case early exit happens.
   if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
                         DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
diff --git a/llvm/test/CodeGen/X86/fptoui-may-overflow.ll b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/fptoui-may-overflow.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
+
+; @fptoui_zext is legal to optimize to a single vcvttps2dq: if one of the i8
+; results of fptoui is poisoned, the corresponding i32 result of the zext is
+; also poisoned. We currently don't implement this optimization.
+
+define <16 x i8> @fptoui_zext(<4 x float> %arg) {
+; CHECK-LABEL: fptoui_zext:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; CHECK-NEXT:    retq
+  %f = fptoui <4 x float> %arg to <4 x i8>
+  %z = zext <4 x i8> %f to <4 x i32>
+  %b = bitcast <4 x i32> %z to <16 x i8>
+  ret <16 x i8> %b
+}
+
+; In @fptoui_shuffle, we must preserve the vpand for correctness. Only the
+; i8 values extracted from %s are poison. The values from the zeroinitializer
+; are not.
+
+define <16 x i8> @fptoui_shuffle(<4 x float> %arg) {
+; CHECK-LABEL: fptoui_shuffle:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vcvttps2dq %xmm0, %xmm0
+; CHECK-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %f = fptoui <4 x float> %arg to <4 x i8>
+  %s = shufflevector <4 x i8> %f, <4 x i8> undef, <16 x i32>
+  %ss = shufflevector <16 x i8> %s, <16 x i8> zeroinitializer, <16 x i32>
+  ret <16 x i8> %ss
+}
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2264,8 +2264,8 @@
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
 ; AVX1-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
@@ -2291,8 +2291,8 @@
 ; XOP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; XOP-NEXT:    vpinsrd $2, 8(%rdi), %xmm0, %xmm1
 ; XOP-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3,4,5,6,7]
-; XOP-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOP-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7]
+; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6,7]
 ; XOP-NEXT:    retq
   %1 = load <3 x i32>, <3 x i32>* %ptr, align 1
diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
--- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
+++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll
@@ -3088,13 +3088,24 @@
 ; X64-SSE2:       # %bb.0:
 ; X64-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE2-NEXT:    psrad $1, %xmm0
+; X64-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; X64-SSE2-NEXT:    retq
 ;
-; X64-AVX-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT:    vpsrad $1, %xmm0, %xmm0
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpsrad $1, %xmm0, %xmm0
+; X64-AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: test_128_i64_x_2_18446744065119617024_mask_ashr_1:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpsrad $1, %xmm0, %xmm0
+; X64-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; X64-AVX2-NEXT:    retq
   %t0 = and <2 x i64> %a0, <i64 18446744065119617024, i64 18446744065119617024>
   %t1 = ashr <2 x i64> %t0, <i64 1, i64 1>
   ret <2 x i64> %t1
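
Note for out-of-tree callers of the deleted helper: SelectionDAG::MaskedElementsAreZero was a thin wrapper around the MaskedValueIsZero overload this patch keeps, asking whether every bit of each demanded element is known to be zero. A minimal local sketch of an equivalent check is below; the helper name demandedEltsKnownZero is illustrative, not an LLVM API. As the fptoui tests above show, "known zero" does not rule out poison, which is why the X86 blend-with-zero combine built on this check is removed rather than fixed.

static bool demandedEltsKnownZero(const SelectionDAG &DAG, SDValue V,
                                  const APInt &DemandedElts,
                                  unsigned Depth = 0) {
  // Mirrors the removed helper: all bits of every demanded element must be
  // known zero. Note that known-zero bits say nothing about poison elements.
  assert(V.getValueType().isFixedLengthVector() &&
         V.getValueType().getVectorNumElements() ==
             DemandedElts.getBitWidth() &&
         "Vector size mismatch");
  APInt DemandedBits = APInt::getAllOnesValue(V.getScalarValueSizeInBits());
  return DAG.MaskedValueIsZero(V, DemandedBits, DemandedElts, Depth);
}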