Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3099,7 +3099,7 @@
 
   // fold (and c1, c2) -> c1&c2
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -3119,14 +3119,14 @@
     return RAND;
   // fold (and (or x, C), D) -> D if (C & D) == D
   if (N1C && N0.getOpcode() == ISD::OR)
-    if (ConstantSDNode *ORI = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+    if (ConstantSDNode *ORI = isConstOrConstSplat(N0.getOperand(1)))
       if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue())
         return N1;
   // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
   if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
     SDValue N0Op0 = N0.getOperand(0);
     APInt Mask = ~N1C->getAPIntValue();
-    Mask = Mask.trunc(N0Op0.getValueSizeInBits());
+    Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
     if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
       SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                                  N0.getValueType(), N0Op0);
@@ -3177,7 +3177,7 @@
       // that will apply equally to all members of the vector, so AND all the
       // lanes of the constant together.
       EVT VT = Vector->getValueType(0);
-      unsigned BitWidth = VT.getVectorElementType().getSizeInBits();
+      unsigned BitWidth = VT.getScalarType().getSizeInBits();
 
       // If the splat value has been compressed to a bitlength lower
       // than the size of the vector lane, we need to re-expand it to
@@ -3251,9 +3251,9 @@
   // fold (and (load x), 255) -> (zextload x, i8)
   // fold (and (extload x, i16), 255) -> (zextload x, i8)
   // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
-  if (N1C && (N0.getOpcode() == ISD::LOAD ||
-              (N0.getOpcode() == ISD::ANY_EXTEND &&
-               N0.getOperand(0).getOpcode() == ISD::LOAD))) {
+  if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
+                                (N0.getOpcode() == ISD::ANY_EXTEND &&
+                                 N0.getOperand(0).getOpcode() == ISD::LOAD))) {
     bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND;
     LoadSDNode *LN0 = HasAnyExt
       ? cast<LoadSDNode>(N0.getOperand(0))
Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2016,6 +2016,26 @@
     KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
     KnownZero = ~KnownOne;
     break;
+  case ISD::BUILD_VECTOR:
+    // Collect the known bits that are shared by every vector element.
+    KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+    for (SDValue SrcOp : Op->ops()) {
+      computeKnownBits(SrcOp, KnownZero2, KnownOne2, Depth + 1);
+
+      // BUILD_VECTOR can implicitly truncate sources, we must handle this.
+      if (SrcOp.getValueSizeInBits() != BitWidth) {
+        assert(SrcOp.getValueSizeInBits() > BitWidth &&
+               "Expected BUILD_VECTOR implicit truncation");
+        KnownOne2 = KnownOne2.trunc(BitWidth);
+        KnownZero2 = KnownZero2.trunc(BitWidth);
+      }
+
+      // Known bits are the values that are shared by every element.
+      // TODO: support per-element known bits.
+      KnownOne &= KnownOne2;
+      KnownZero &= KnownZero2;
+    }
+    break;
   case ISD::AND:
     // If either the LHS or the RHS are Zero, the result is zero.
     computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -468,6 +468,33 @@
     KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
     KnownZero = ~KnownOne;
     return false;   // Don't fall through, will infinitely loop.
+  case ISD::BUILD_VECTOR:
+    // Collect the known bits that are shared by every constant vector element.
+    KnownZero = KnownOne = APInt::getAllOnesValue(BitWidth);
+    for (SDValue SrcOp : Op->ops()) {
+      if (!isa<ConstantSDNode>(SrcOp)) {
+        // We can only handle all constant values - bail out with no known bits.
+        KnownZero = KnownOne = APInt(BitWidth, 0);
+        return false;
+      }
+      KnownOne2 = cast<ConstantSDNode>(SrcOp)->getAPIntValue();
+      KnownZero2 = ~KnownOne2;
+
+      // BUILD_VECTOR can implicitly truncate sources, we must handle this.
+      if (KnownOne2.getBitWidth() != BitWidth) {
+        assert(KnownOne2.getBitWidth() > BitWidth &&
+               KnownZero2.getBitWidth() > BitWidth &&
+               "Expected BUILD_VECTOR implicit truncation");
+        KnownOne2 = KnownOne2.trunc(BitWidth);
+        KnownZero2 = KnownZero2.trunc(BitWidth);
+      }
+
+      // Known bits are the values that are shared by every element.
+      // TODO: support per-element known bits.
+      KnownOne &= KnownOne2;
+      KnownZero &= KnownZero2;
+    }
+    return false;   // Don't fall through, will infinitely loop.
   case ISD::AND:
     // If the RHS is a constant, check to see if the LHS would be zero without
     // using the bits from the RHS.  Below, we use knowledge about the RHS to
Index: test/CodeGen/AMDGPU/load-constant-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/load-constant-i16.ll
+++ test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -138,7 +138,7 @@
 ; v2i16 is naturally 4 byte aligned
 ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; TODO: This should use DST, but for some there are redundant MOVs
-; EG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
+; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
 ; EG: 16
 define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
@@ -212,9 +212,10 @@
 ; v4i16 is naturally 8 byte aligned
 ; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1
 ; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
+; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
 ; EG-DAG: 16
+; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
+; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
 ; EG-DAG: 16
 define void @constant_constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
Index: test/CodeGen/AMDGPU/load-global-i16.ll
===================================================================
--- test/CodeGen/AMDGPU/load-global-i16.ll
+++ test/CodeGen/AMDGPU/load-global-i16.ll
@@ -147,7 +147,7 @@
 
 ; EG: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; TODO: This should use DST, but for some there are redundant MOVs
-; EG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
+; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
 ; EG: 16
 define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
@@ -219,9 +219,10 @@
 
 ; EG: VTX_READ_64 [[DST:T[0-9]\.XY]], {{T[0-9].[XYZW]}}, 0, #1
 ; TODO: These should use DST, but for some there are redundant MOVs
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{PV.[XYZW]}}, literal
-; EG-DAG: LSHR {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
+; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
 ; EG-DAG: 16
+; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
+; EG-DAG: AND_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{T[0-9].[XYZW]}}, literal
 ; EG-DAG: 16
 define void @global_global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
Index: test/CodeGen/X86/combine-and.ll
===================================================================
--- test/CodeGen/X86/combine-and.ll
+++ test/CodeGen/X86/combine-and.ll
@@ -186,8 +186,7 @@
 define <2 x i64> @and_or_v2i64(<2 x i64> %a0) {
 ; CHECK-LABEL: and_or_v2i64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    orps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,8]
 ; CHECK-NEXT:    retq
   %1 = or <2 x i64> %a0,
   %2 = and <2 x i64> %1,
@@ -197,8 +196,7 @@
 define <4 x i32> @and_or_v4i32(<4 x i32> %a0) {
 ; CHECK-LABEL: and_or_v4i32:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    orps {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [3,3,3,3]
 ; CHECK-NEXT:    retq
   %1 = or <4 x i32> %a0,
   %2 = and <4 x i32> %1,
@@ -212,10 +210,7 @@
 define <2 x i64> @and_or_zext_v2i32(<2 x i32> %a0) {
 ; CHECK-LABEL: and_or_zext_v2i32:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-NEXT:    por {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = zext <2 x i32> %a0 to <2 x i64>
   %2 = or <2 x i64> %1,
@@ -226,10 +221,7 @@
 define <4 x i32> @and_or_zext_v4i16(<4 x i16> %a0) {
 ; CHECK-LABEL: and_or_zext_v4i16:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; CHECK-NEXT:    por {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    pand {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %1 = zext <4 x i16> %a0 to <4 x i32>
   %2 = or <4 x i32> %1,