Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1899,14 +1899,30 @@
     }
     break;
   }
-  case Intrinsic::masked_load:
+  case Intrinsic::masked_load: {
+    auto VWidth = II->getType()->getVectorNumElements();
+    APInt UndefElts(VWidth, 0);
+    APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+    if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts))
+      if (V != II)
+        return replaceInstUsesWith(*II, V);
+
     if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II, Builder))
       return replaceInstUsesWith(CI, SimplifiedMaskedOp);
     break;
+  }
   case Intrinsic::masked_store:
     return simplifyMaskedStore(*II, *this);
-  case Intrinsic::masked_gather:
+  case Intrinsic::masked_gather: {
+    auto VWidth = II->getType()->getVectorNumElements();
+    APInt UndefElts(VWidth, 0);
+    APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+    if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts))
+      if (V != II)
+        return replaceInstUsesWith(*II, V);
+
     return simplifyMaskedGather(*II, *this);
+  }
   case Intrinsic::masked_scatter:
     return simplifyMaskedScatter(*II, *this);
   case Intrinsic::launder_invariant_group:
Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1441,6 +1441,29 @@
     IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
     if (!II) break;
     switch (II->getIntrinsicID()) {
+    case Intrinsic::masked_gather: // fallthrough
+    case Intrinsic::masked_load: {
+      // Subtle - We *can't* simplify the zero bits of the mask just because
+      // they're unused because that might introduce a fault.
+      // TODO: but we can fold them to zero in a constant mask
+      APInt DemandedLoad(DemandedElts), DemandedPassThrough(DemandedElts);
+      if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
+        for (unsigned i = 0; i < VWidth; i++) {
+          Constant *CElt = CV->getAggregateElement(i);
+          if (CElt->isNullValue())
+            DemandedLoad.clearBit(i);
+          else if (CElt->isAllOnesValue())
+            DemandedPassThrough.clearBit(i);
+        }
+      if (II->getIntrinsicID() == Intrinsic::masked_gather)
+        simplifyAndSetOp(II, 0, DemandedLoad, UndefElts2);
+      simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
+
+      // Output elements are undefined if the element from both sources are.
+      // TODO: can strengthen via mask as well.
+      UndefElts = UndefElts2 & UndefElts3;
+      break;
+    }
     case Intrinsic::x86_xop_vfrcz_ss:
     case Intrinsic::x86_xop_vfrcz_sd:
       // The instructions for these intrinsics are speced to zero upper bits not
Index: test/Transforms/InstCombine/masked_intrinsics.ll
===================================================================
--- test/Transforms/InstCombine/masked_intrinsics.ll
+++ test/Transforms/InstCombine/masked_intrinsics.ll
@@ -37,8 +37,7 @@
 
 define <2 x double> @load_lane0(<2 x double>* %ptr, double %pt) {
 ; CHECK-LABEL: @load_lane0(
-; CHECK-NEXT:    [[PTV1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0
-; CHECK-NEXT:    [[PTV2:%.*]] = shufflevector <2 x double> [[PTV1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTV2:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 1
 ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* [[PTR:%.*]], i32 2, <2 x i1> <i1 true, i1 false>, <2 x double> [[PTV2]])
 ; CHECK-NEXT:    ret <2 x double> [[RES]]
 ;
@@ -81,7 +80,7 @@
 
 define <2 x double> @gather_onemask(<2 x double*> %ptrs, <2 x double> %passthru) {
 ; CHECK-LABEL: @gather_onemask(
-; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 5, <2 x i1> <i1 true, i1 true>, <2 x double> [[PASSTHRU:%.*]])
+; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS:%.*]], i32 5, <2 x i1> <i1 true, i1 true>, <2 x double> undef)
 ; CHECK-NEXT:    ret <2 x double> [[RES]]
 ;
   %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 5, <2 x i1> <i1 true, i1 true>, <2 x double> %passthru)
@@ -91,9 +90,8 @@
 
 define <2 x double> @gather_lane0(double* %base, double %pt) {
 ; CHECK-LABEL: @gather_lane0(
-; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> <i64 0, i64 1>
-; CHECK-NEXT:    [[PT_V1:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 0
-; CHECK-NEXT:    [[PT_V2:%.*]] = shufflevector <2 x double> [[PT_V1]], <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[PTRS:%.*]] = getelementptr double, double* [[BASE:%.*]], <2 x i64> <i64 0, i64 undef>
+; CHECK-NEXT:    [[PT_V2:%.*]] = insertelement <2 x double> undef, double [[PT:%.*]], i64 1
 ; CHECK-NEXT:    [[RES:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> [[PTRS]], i32 5, <2 x i1> <i1 true, i1 false>, <2 x double> [[PT_V2]])
 ; CHECK-NEXT:    ret <2 x double> [[RES]]
 ;