Index: lib/Analysis/ConstantFolding.cpp
===================================================================
--- lib/Analysis/ConstantFolding.cpp
+++ lib/Analysis/ConstantFolding.cpp
@@ -442,8 +442,8 @@
     return nullptr;
 
   GlobalValue *GVal;
-  APInt Offset;
-  if (!IsConstantOffsetFromGlobal(C, GVal, Offset, DL))
+  APInt OffsetAI;
+  if (!IsConstantOffsetFromGlobal(C, GVal, OffsetAI, DL))
     return nullptr;
 
   auto *GV = dyn_cast<GlobalVariable>(GVal);
@@ -451,19 +451,29 @@
       !GV->getInitializer()->getType()->isSized())
     return nullptr;
 
-  // If we're loading off the beginning of the global, some bytes may be valid,
-  // but we don't try to handle this.
-  if (Offset.isNegative())
-    return nullptr;
+  int64_t Offset = OffsetAI.getSExtValue();
+  int64_t InitializerSize = DL.getTypeAllocSize(GV->getInitializer()->getType());
+
+  // If we're not accessing anything in this constant, the result is undefined.
+  if (Offset + BytesLoaded <= 0)
+    return UndefValue::get(IntType);
 
   // If we're not accessing anything in this constant, the result is undefined.
-  if (Offset.getZExtValue() >=
-      DL.getTypeAllocSize(GV->getInitializer()->getType()))
+  if (Offset >= InitializerSize)
     return UndefValue::get(IntType);
 
   unsigned char RawBytes[32] = {0};
-  if (!ReadDataFromGlobal(GV->getInitializer(), Offset.getZExtValue(), RawBytes,
-                          BytesLoaded, DL))
+  unsigned char *CurPtr = RawBytes;
+  unsigned BytesLeft = BytesLoaded;
+
+  // If we're loading off the beginning of the global, some bytes may be valid.
+  if (Offset < 0) {
+    CurPtr += -Offset;
+    BytesLeft += Offset;
+    Offset = 0;
+  }
+
+  if (!ReadDataFromGlobal(GV->getInitializer(), Offset, CurPtr, BytesLeft, DL))
     return nullptr;
 
   APInt ResultVal = APInt(IntType->getBitWidth(), 0);
@@ -1289,6 +1299,7 @@
   case Intrinsic::fmuladd:
   case Intrinsic::copysign:
   case Intrinsic::round:
+  case Intrinsic::masked_load:
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
@@ -1833,11 +1844,41 @@
 Constant *ConstantFoldVectorCall(StringRef Name, unsigned IntrinsicID,
                                  VectorType *VTy,
                                  ArrayRef<Constant *> Operands,
+                                 const DataLayout &DL,
                                  const TargetLibraryInfo *TLI) {
   SmallVector<Constant *, 4> Result(VTy->getNumElements());
   SmallVector<Constant *, 4> Lane(Operands.size());
   Type *Ty = VTy->getElementType();
 
+  if (IntrinsicID == Intrinsic::masked_load) {
+    auto *SrcPtr = Operands[0];
+    auto *Mask = Operands[2];
+    auto *Passthru = Operands[3];
+    Constant *VecData = ConstantFoldLoadFromConstPtr(SrcPtr, VTy, DL);
+
+    SmallVector<Constant *, 32> NewElements;
+    for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
+      auto *MaskElt =
+          dyn_cast_or_null<ConstantInt>(Mask->getAggregateElement(I));
+      if (!MaskElt)
+        break;
+      if (MaskElt->isZero()) {
+        auto *PassthruElt = Passthru->getAggregateElement(I);
+        if (!PassthruElt)
+          break;
+        NewElements.push_back(PassthruElt);
+      } else {
+        assert(MaskElt->isOne());
+        auto *VecElt = VecData->getAggregateElement(I);
+        if (!VecElt)
+          break;
+        NewElements.push_back(VecElt);
+      }
+    }
+    if (NewElements.size() == VTy->getNumElements())
+      return ConstantVector::get(NewElements);
+  }
+
   for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
     // Gather a column of constants.
     for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) {
@@ -1870,7 +1911,8 @@
   Type *Ty = F->getReturnType();
 
   if (auto *VTy = dyn_cast<VectorType>(Ty))
-    return ConstantFoldVectorCall(Name, F->getIntrinsicID(), VTy, Operands, TLI);
+    return ConstantFoldVectorCall(Name, F->getIntrinsicID(), VTy, Operands,
+                                  F->getParent()->getDataLayout(), TLI);
 
   return ConstantFoldScalarCall(Name, F->getIntrinsicID(), Ty, Operands, TLI);
 }
Index: lib/Analysis/InstructionSimplify.cpp
===================================================================
--- lib/Analysis/InstructionSimplify.cpp
+++ lib/Analysis/InstructionSimplify.cpp
@@ -3991,6 +3991,15 @@
                                   Q.DL);
   }
 
+  // Simplify calls to llvm.masked.load.*
+  if (IID == Intrinsic::masked_load) {
+    IterTy MaskArg = ArgBegin + 2;
+    // If the mask is all zeros, the "passthru" argument is the result.
+    if (auto *ConstMask = dyn_cast<Constant>(*MaskArg))
+      if (ConstMask->isNullValue())
+        return ArgBegin[3];
+  }
+
   // Perform idempotent optimizations
   if (!IsIdempotent(IID))
     return nullptr;
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1044,10 +1044,6 @@
   if (!ConstMask)
     return nullptr;
 
-  // If the mask is all zeros, the "passthru" argument is the result.
-  if (ConstMask->isNullValue())
-    return II.getArgOperand(3);
-
   // If the mask is all ones, this is a plain vector load of the 1st argument.
   if (ConstMask->isAllOnesValue()) {
     Value *LoadPtr = II.getArgOperand(0);
Index: test/Transforms/InstSimplify/call.ll
===================================================================
--- test/Transforms/InstSimplify/call.ll
+++ test/Transforms/InstSimplify/call.ll
@@ -204,4 +204,15 @@
 ; CHECK-LABEL: define i32 @call_undef(
 ; CHECK: ret i32 undef
 
+@GV = private constant [8 x i32] [i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49]
+
+define <8 x i32> @partial_masked_load() {
+; CHECK-LABEL: @partial_masked_load(
+; CHECK: ret <8 x i32> <i32 undef, i32 undef, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
+  %masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -2) to <8 x i32>*), i32 4, <8 x i1> <i1 false, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  ret <8 x i32> %masked.load
+}
+
 declare noalias i8* @malloc(i64)
+
+declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>)
Index: test/Transforms/InstSimplify/load.ll
===================================================================
--- test/Transforms/InstSimplify/load.ll
+++ test/Transforms/InstSimplify/load.ll
@@ -20,3 +20,11 @@
   ret i32 %load
 }
 
+@GV = private constant [8 x i32] [i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49]
+
+define <8 x i32> @partial_load() {
+; CHECK-LABEL: @partial_load(
+; CHECK: ret <8 x i32> <i32 0, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48>
+  %load = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr ([8 x i32], [8 x i32]* @GV, i64 0, i64 -1) to <8 x i32>*)
+  ret <8 x i32> %load
+}
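Note (not part of the patch): the following is a minimal standalone C++ sketch of the negative-offset handling that the ConstantFolding.cpp hunks above introduce. GVBytes and foldPartialLoad are invented names used only for illustration, and a little-endian data layout is assumed. It reproduces the <i32 0, i32 42, ..., i32 48> result that the load.ll test expects: bytes before the start of @GV fold to zero, and the in-bounds bytes come from the initializer.

// Illustration only, not part of the patch. Mirrors the patched
// FoldReinterpretLoadFromConstPtr byte logic with plain arrays instead of
// llvm::Constant; names and layout assumptions are noted above.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Bytes of @GV = [i32 42, 43, ..., 49] on a little-endian target.
static std::vector<uint8_t> GVBytes() {
  std::vector<uint8_t> Bytes;
  for (uint32_t V = 42; V <= 49; ++V)
    for (int B = 0; B < 4; ++B)
      Bytes.push_back(uint8_t(V >> (8 * B)));
  return Bytes;
}

// Bytes before the start of the global read as zero; bytes inside the global
// come from its initializer. Callers are assumed to have already rejected
// loads that do not overlap the global at all (as the patched code does).
static std::vector<uint8_t> foldPartialLoad(int64_t Offset, unsigned BytesLoaded) {
  std::vector<uint8_t> Init = GVBytes();
  std::vector<uint8_t> RawBytes(BytesLoaded, 0);
  uint8_t *CurPtr = RawBytes.data();
  int64_t BytesLeft = BytesLoaded;
  if (Offset < 0) {      // Loading off the beginning of the global: skip the
    CurPtr += -Offset;   // leading zero-filled bytes and shrink the in-bounds
    BytesLeft += Offset; // portion of the read accordingly.
    Offset = 0;
  }
  std::memcpy(CurPtr, Init.data() + Offset,
              std::min<int64_t>(BytesLeft, int64_t(Init.size()) - Offset));
  return RawBytes;
}

int main() {
  // load <8 x i32> at element index -1 (byte offset -4), as in the load.ll test.
  std::vector<uint8_t> Bytes = foldPartialLoad(-4, 32);
  for (unsigned I = 0; I != 8; ++I) {
    uint32_t V;
    std::memcpy(&V, Bytes.data() + 4 * I, 4);
    std::cout << V << (I + 1 == 8 ? '\n' : ' '); // prints: 0 42 43 44 45 46 47 48
  }
}

The same shift of CurPtr and BytesLeft is what lets the patched code call ReadDataFromGlobal with a clamped, non-negative offset while keeping the out-of-bounds prefix zero-filled.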