Index: lib/CodeGen/CodeGenPrepare.cpp
===================================================================
--- lib/CodeGen/CodeGenPrepare.cpp
+++ lib/CodeGen/CodeGenPrepare.cpp
@@ -152,11 +152,11 @@
     bool EliminateMostlyEmptyBlocks(Function &F);
     bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
     void EliminateMostlyEmptyBlock(BasicBlock *BB);
-    bool OptimizeBlock(BasicBlock &BB);
-    bool OptimizeInst(Instruction *I);
+    bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT);
+    bool OptimizeInst(Instruction *I, bool& ModifiedDT);
     bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
     bool OptimizeInlineAsmInst(CallInst *CS);
-    bool OptimizeCallInst(CallInst *CI);
+    bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT);
     bool MoveExtToFormExtLoad(Instruction *I);
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
@@ -226,7 +226,13 @@
     MadeChange = false;
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = I++;
-      MadeChange |= OptimizeBlock(*BB);
+      bool ModifiedDTOnIteration = false;
+      MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration);
+
+      // Restart BB iteration if the dominator tree of the Function was changed
+      ModifiedDT |= ModifiedDTOnIteration;
+      if (ModifiedDTOnIteration)
+        break;
     }
     EverMadeChange |= MadeChange;
   }
@@ -838,7 +844,211 @@
 };
 } // end anonymous namespace
 
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+// ScalarizeMaskedLoad() translates a masked load intrinsic, like
+// <16 x i32> @llvm.masked.load(i8* %addr, <16 x i32> %passthru, i32 align,
+//                              <16 x i1> %mask)
+// to a chain of basic blocks, loading the elements one-by-one if
+// the appropriate mask bit is set
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.load, label %else
+//
+// cond.load:                                        ; preds = %0
+//  %4 = getelementptr i32* %1, i32 0
+//  %5 = load i32* %4
+//  %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+//  br label %else
+//
+// else:                                             ; preds = %0, %cond.load
+//  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+//  %7 = extractelement <16 x i1> %mask, i32 1
+//  %8 = icmp eq i1 %7, true
+//  br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1:                                       ; preds = %else
+//  %9 = getelementptr i32* %1, i32 1
+//  %10 = load i32* %9
+//  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+//  br label %else2
+//
+// else2:                                            ; preds = %else, %cond.load1
+//  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+//  %12 = extractelement <16 x i1> %mask, i32 2
+//  %13 = icmp eq i1 %12, true
+//  br i1 %13, label %cond.load4, label %else5
+//
+static void ScalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Src0 = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(3);
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  Type *EltTy = VecType->getElementType();
+
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_load = icmp eq i1 %mask_1, true
+    //  br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst *Load = Builder.CreateLoad(Gep, false);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// ScalarizeMaskedStore() translates a masked store intrinsic, like
+// void @llvm.masked.store(i8* %addr, <16 x i32> %src, i32 align,
+//                         <16 x i1> %mask)
+// to a chain of basic blocks, storing the elements one-by-one if
+// the appropriate mask bit is set
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.store, label %else
+//
+// cond.store:                                       ; preds = %0
+//  %4 = extractelement <16 x i32> %val, i32 0
+//  %5 = getelementptr i32* %1, i32 0
+//  store i32 %4, i32* %5
+//  br label %else
+//
+// else:                                             ; preds = %0, %cond.store
+//  %6 = extractelement <16 x i1> %mask, i32 1
+//  %7 = icmp eq i1 %6, true
+//  br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1:                                      ; preds = %else
+//  %8 = extractelement <16 x i32> %val, i32 1
+//  %9 = getelementptr i32* %1, i32 1
+//  store i32 %8, i32* %9
+//  br label %else2
+//  . . .
+static void ScalarizeMaskedStore(CallInst *CI) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Src = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  Type *EltTy = VecType->getElementType();
+
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_store = icmp eq i1 %mask_1, true
+    //  br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value *Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateStore(OneElt, Gep);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
   // Lower inline assembly if we can.
@@ -858,38 +1068,60 @@
     return true;
   }
 
-  // Lower all uses of llvm.objectsize.*
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
-  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
-    Type *ReturnTy = CI->getType();
-    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
-
-    // Substituting this can cause recursive simplifications, which can
-    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
-    // happens.
-    WeakVH IterHandle(CurInstIterator);
-
-    replaceAndRecursivelySimplify(CI, RetVal,
-                                  TLI ? TLI->getDataLayout() : nullptr,
-                                  TLInfo, ModifiedDT ? nullptr : DT);
+  if (II) {
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::objectsize: {
+      // Lower all uses of llvm.objectsize.*
+      bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+      Type *ReturnTy = CI->getType();
+      Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+
+      // Substituting this can cause recursive simplifications, which can
+      // invalidate our iterator.  Use a WeakVH to hold onto it in case this
+      // happens.
+      WeakVH IterHandle(CurInstIterator);
+
+      replaceAndRecursivelySimplify(CI, RetVal,
+                                    TLI ? TLI->getDataLayout() : nullptr,
+                                    TLInfo, ModifiedDT ? nullptr : DT);
 
-    // If the iterator instruction was recursively deleted, start over at the
-    // start of the block.
-    if (IterHandle != CurInstIterator) {
-      CurInstIterator = BB->begin();
-      SunkAddrs.clear();
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      if (IterHandle != CurInstIterator) {
+        CurInstIterator = BB->begin();
+        SunkAddrs.clear();
+      }
+      return true;
+    }
+    case Intrinsic::masked_load: {
+      // Scalarize unsupported vector masked load
+      if (!TTI->isLegalPredicatedLoad(CI->getType(), 1)) {
+        ScalarizeMaskedLoad(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
+    case Intrinsic::masked_store: {
+      if (!TTI->isLegalPredicatedStore(CI->getArgOperand(1)->getType(), 1)) {
+        ScalarizeMaskedStore(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
     }
-    return true;
-  }
 
-  if (II && TLI) {
-    SmallVector<Value*, 2> PtrOps;
-    Type *AccessTy;
-    if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
-      while (!PtrOps.empty())
-        if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
-          return true;
+    if (TLI) {
+      SmallVector<Value*, 2> PtrOps;
+      Type *AccessTy;
+      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
+        while (!PtrOps.empty())
+          if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
+            return true;
+    }
   }
 
   // From here on out we're working with named functions.
@@ -3577,7 +3809,7 @@
   return false;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
@@ -3656,14 +3888,14 @@
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC);
+      OptimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI);
+    return OptimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
     return OptimizeSelectInst(SI);
@@ -3680,14 +3912,16 @@
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
+  while (CurInstIterator != BB.end()) {
+    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    if (ModifiedDT)
+      return true;
+  }
   MadeChange |= DupRetToEnableTailCallOpts(&BB);
 
   return MadeChange;
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8931,6 +8931,7 @@
                                 int_x86_avx2_maskstore_q,
                                 int_x86_avx2_maskstore_q_256>, VEX_W;
 
+let Predicates = [HasAVX2] in {
 def: Pat<(masked_store addr:$ptr, (v8i32 VR256:$mask), (v8f32 VR256:$src)),
          (VPMASKMOVDYmr addr:$ptr, VR256:$mask, VR256:$src)>;
 
@@ -8985,7 +8986,7 @@
 def: Pat<(v4i64 (masked_load addr:$ptr, (v4i64 VR256:$mask),
                              (v4i64 VR256:$src0))),
          (VBLENDVPDYrr VR256:$src0, (VPMASKMOVQYrm VR256:$mask, addr:$ptr),
                        VR256:$mask)>;
-
+}
 //===----------------------------------------------------------------------===//
 // Variable Bit Shifts
Index: test/CodeGen/X86/masked_memop.ll
===================================================================
--- test/CodeGen/X86/masked_memop.ll
+++ test/CodeGen/X86/masked_memop.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=knl < %s | FileCheck %s -check-prefix=AVX512
 ; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2
+; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=AVX_SCALAR
 
 ; AVX512-LABEL: test1
 ; AVX512: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
@@ -9,6 +10,12 @@
 ; AVX2: vpmaskmovd (%rdi)
 ; AVX2-NOT: blend
 
+; AVX_SCALAR-LABEL: test1
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: insertelement
 define <16 x i32> @test1(<16 x i32> %trigger, i8* %addr) {
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   %res = call <16 x i32> @llvm.masked.load.v16i32(i8* %addr, <16 x i32>undef, i32 4, <16 x i1>%mask)
@@ -31,6 +38,14 @@
 ; AVX512-LABEL: test3
 ; AVX512: vmovdqu32 %zmm1, (%rdi) {%k1}
 
+; AVX_SCALAR-LABEL: test3
+; AVX_SCALAR-NOT: masked
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
+; AVX_SCALAR: extractelement
+; AVX_SCALAR: store
 define void @test3(<16 x i32> %trigger, i8* %addr, <16 x i32> %val) {
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v16i32(i8* %addr, <16 x i32>%val, i32 4, <16 x i1>%mask)