Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -834,11 +834,12 @@
 static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
   Value *Ptr = II.getOperand(0);
   Value *Mask = II.getOperand(1);
+  Constant *ZeroVec = Constant::getNullValue(II.getType());
 
   // Special case a zero mask since that's not a ConstantDataVector.
-  // This masked load instruction does nothing, so return an undef.
+  // This masked load instruction creates a zero vector.
   if (isa<ConstantAggregateZero>(Mask))
-    return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
+    return IC.replaceInstUsesWith(II, ZeroVec);
 
   auto *ConstMask = dyn_cast<ConstantDataVector>(Mask);
   if (!ConstMask)
@@ -857,7 +858,9 @@
   // on each element's most significant bit (the sign bit).
   Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
 
-  CallInst *NewMaskedLoad = IC.Builder->CreateMaskedLoad(PtrCast, 1, BoolMask);
+  // The pass-through vector for an x86 masked load is a zero vector.
+  CallInst *NewMaskedLoad =
+      IC.Builder->CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
   return IC.replaceInstUsesWith(II, NewMaskedLoad);
 }
 
Index: llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-masked-memops.ll
@@ -13,14 +13,14 @@
 ; CHECK-NEXT: ret <4 x float> %ld
 }
 
-; Zero mask is a nop.
+; Zero mask returns a zero vector.
 
 define <4 x float> @mload_zeros(i8* %f) {
   %ld = tail call <4 x float> @llvm.x86.avx.maskload.ps(i8* %f, <4 x i32> zeroinitializer)
   ret <4 x float> %ld
 
 ; CHECK-LABEL: @mload_zeros(
-; CHECK-NEXT: ret <4 x float> undef
+; CHECK-NEXT: ret <4 x float> zeroinitializer
 }
 
 ; Only the sign bit matters.
@@ -30,7 +30,7 @@
   ret <4 x float> %ld
 
 ; CHECK-LABEL: @mload_fake_ones(
-; CHECK-NEXT: ret <4 x float> undef
+; CHECK-NEXT: ret <4 x float> zeroinitializer
 }
 
 ; All mask bits are set, so this is just a vector load.
@@ -53,7 +53,7 @@
 
 ; CHECK-LABEL: @mload_one_one(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x float>*
-; CHECK-NEXT: %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> undef)
+; CHECK-NEXT: %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> zeroinitializer)
 ; CHECK-NEXT: ret <4 x float> %1
 }
 
@@ -65,7 +65,7 @@
 
 ; CHECK-LABEL: @mload_one_one_double(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x double>*
-; CHECK-NEXT: %1 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> undef)
+; CHECK-NEXT: %1 = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x double> zeroinitializer)
 ; CHECK-NEXT: ret <2 x double> %1
 }
 
@@ -77,7 +77,7 @@
 
 ; CHECK-LABEL: @mload_v8f32(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x float>*
-; CHECK-NEXT: %1 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> undef)
+; CHECK-NEXT: %1 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x float> zeroinitializer)
 ; CHECK-NEXT: ret <8 x float> %1
 }
 
@@ -87,7 +87,7 @@
 
 ; CHECK-LABEL: @mload_v4f64(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x double>*
-; CHECK-NEXT: %1 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> undef)
+; CHECK-NEXT: %1 = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> zeroinitializer)
 ; CHECK-NEXT: ret <4 x double> %1
 }
 
@@ -99,7 +99,7 @@
 
 ; CHECK-LABEL: @mload_v4i32(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i32>*
-; CHECK-NEXT: %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> undef)
+; CHECK-NEXT: %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %castvec, i32 1, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer)
 ; CHECK-NEXT: ret <4 x i32> %1
 }
 
@@ -109,7 +109,7 @@
 
 ; CHECK-LABEL: @mload_v2i64(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <2 x i64>*
-; CHECK-NEXT: %1 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> undef)
+; CHECK-NEXT: %1 = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %castvec, i32 1, <2 x i1> <i1 true, i1 false>, <2 x i64> zeroinitializer)
 ; CHECK-NEXT: ret <2 x i64> %1
 }
 
@@ -119,7 +119,7 @@
 
 ; CHECK-LABEL: @mload_v8i32(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <8 x i32>*
-; CHECK-NEXT: %1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> undef)
+; CHECK-NEXT: %1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %castvec, i32 1, <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i32> zeroinitializer)
 ; CHECK-NEXT: ret <8 x i32> %1
 }
 
@@ -129,7 +129,7 @@
 
 ; CHECK-LABEL: @mload_v4i64(
 ; CHECK-NEXT: %castvec = bitcast i8* %f to <4 x i64>*
-; CHECK-NEXT: %1 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> undef)
+; CHECK-NEXT: %1 = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %castvec, i32 1, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> zeroinitializer)
 ; CHECK-NEXT: ret <4 x i64> %1
 }