Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -499,17 +499,21 @@
   /// modes that operate across loop iterations.
   bool shouldFavorBackedgeIndex(const Loop *L) const;
 
-  /// Return true if the target supports masked load/store
-  /// AVX2 and AVX-512 targets allow masks for consecutive load and store
+  /// Return true if the target supports masked store.
   bool isLegalMaskedStore(Type *DataType) const;
+  /// Return true if the target supports masked load.
   bool isLegalMaskedLoad(Type *DataType) const;
-  /// Return true if the target supports masked gather/scatter
-  /// AVX-512 fully supports gather and scatter for vectors with 32 and 64
-  /// bits scalar type.
+  /// Return true if the target supports masked scatter.
   bool isLegalMaskedScatter(Type *DataType) const;
+  /// Return true if the target supports masked gather.
   bool isLegalMaskedGather(Type *DataType) const;
+  /// Return true if the target supports masked compress store.
+  bool isLegalMaskedCompressStore(Type *DataType) const;
+  /// Return true if the target supports masked expand load.
+  bool isLegalMaskedExpandLoad(Type *DataType) const;
+
   /// Return true if the target has a unified operation to calculate division
   /// and remainder. If so, the additional implicit multiplication and
   /// subtraction required to calculate a remainder from division are free. This
@@ -1085,6 +1089,8 @@
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
   virtual bool isLegalMaskedGather(Type *DataType) = 0;
+  virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
+  virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
   virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
   virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
   virtual bool prefersVectorizedAddressing() = 0;
@@ -1336,6 +1342,12 @@
   bool isLegalMaskedGather(Type *DataType) override {
     return Impl.isLegalMaskedGather(DataType);
   }
+  bool isLegalMaskedCompressStore(Type *DataType) override {
+    return Impl.isLegalMaskedCompressStore(DataType);
+  }
+  bool isLegalMaskedExpandLoad(Type *DataType) override {
+    return Impl.isLegalMaskedExpandLoad(DataType);
+  }
   bool hasDivRemOp(Type *DataType, bool IsSigned) override {
     return Impl.hasDivRemOp(DataType, IsSigned);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -270,6 +270,10 @@
   bool isLegalMaskedGather(Type *DataType) { return false; }
 
+  bool isLegalMaskedCompressStore(Type *DataType) { return false; }
+
+  bool isLegalMaskedExpandLoad(Type *DataType) { return false; }
+
   bool hasDivRemOp(Type *DataType, bool IsSigned) { return false; }
 
   bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { return false; }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -185,6 +185,14 @@
   return TTIImpl->isLegalMaskedScatter(DataType);
 }
 
+bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
+  return TTIImpl->isLegalMaskedCompressStore(DataType);
+}
+
+bool TargetTransformInfo::isLegalMaskedExpandLoad(Type *DataType) const {
+  return TTIImpl->isLegalMaskedExpandLoad(DataType);
+}
+
 bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const {
   return TTIImpl->hasDivRemOp(DataType, IsSigned);
 }
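The new hooks follow the pattern of the existing isLegalMaskedLoad/Store and isLegalMaskedGather/Scatter queries: they default to false in TargetTransformInfoImpl and are forwarded through the TTI wrapper, so a transform can ask the target before emitting the intrinsics. A minimal caller-side sketch (the helper below is hypothetical, not part of this patch):

    // Hypothetical helper: only emit llvm.masked.expandload /
    // llvm.masked.compressstore when the target claims native support;
    // otherwise keep (or emit) the scalar loop with explicit branches.
    static bool canUseExpandLoadCompressStore(const TargetTransformInfo &TTI,
                                              Type *VecDataTy) {
      return TTI.isLegalMaskedExpandLoad(VecDataTy) &&
             TTI.isLegalMaskedCompressStore(VecDataTy);
    }

ScalarizeMaskedMemIntrin (next file) uses the same two hooks in the opposite direction: when they return false, the intrinsic call is expanded into a per-element branch sequence so instruction selection never sees it.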
Index: lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
===================================================================
--- lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -534,6 +534,154 @@
   ModifiedDT = true;
 }
 
+static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Mask = CI->getArgOperand(1);
+  Value *PassThru = CI->getArgOperand(2);
+
+  VectorType *VecType = cast<VectorType>(CI->getType());
+
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  unsigned VectorWidth = VecType->getNumElements();
+
+  // The result vector
+  Value *VResult = PassThru;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  br i1 %mask_1, label %cond.load, label %else
+    //
+
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Idx);
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(),
+                                                     "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, 1);
+    Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
+
+    // Move the pointer if there are more blocks to come.
+    Value *NewPtr;
+    if ((Idx + 1) != VectorWidth)
+      NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+    OldBr->eraseFromParent();
+    BasicBlock *PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+
+    // Create the phi to join the new and previous value.
+    PHINode *ResultPhi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+    ResultPhi->addIncoming(NewVResult, CondBlock);
+    ResultPhi->addIncoming(VResult, PrevIfBlock);
+    VResult = ResultPhi;
+
+    // Add a PHI for the pointer if this isn't the last iteration.
+    if ((Idx + 1) != VectorWidth) {
+      PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+      PtrPhi->addIncoming(NewPtr, CondBlock);
+      PtrPhi->addIncoming(Ptr, PrevIfBlock);
+      Ptr = PtrPhi;
+    }
+  }
+
+  CI->replaceAllUsesWith(VResult);
+  CI->eraseFromParent();
+
+  ModifiedDT = true;
+}
+static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptr = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+
+  VectorType *VecType = cast<VectorType>(Src->getType());
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  Type *EltTy = VecType->getVectorElementType();
+
+  unsigned VectorWidth = VecType->getNumElements();
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  br i1 %mask_1, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock =
+        IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Idx);
+    Builder.CreateAlignedStore(OneElt, Ptr, 1);
+
+    // Move the pointer if there are more blocks to come.
+    Value *NewPtr;
+    if ((Idx + 1) != VectorWidth)
+      NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, 1);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Predicate, OldBr);
+    OldBr->eraseFromParent();
+    BasicBlock *PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+
+    // Add a PHI for the pointer if this isn't the last iteration.
+    if ((Idx + 1) != VectorWidth) {
+      PHINode *PtrPhi = Builder.CreatePHI(Ptr->getType(), 2, "ptr.phi.else");
+      PtrPhi->addIncoming(NewPtr, CondBlock);
+      PtrPhi->addIncoming(Ptr, PrevIfBlock);
+      Ptr = PtrPhi;
+    }
+  }
+  CI->eraseFromParent();
+
+  ModifiedDT = true;
+}
+
 bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
   bool EverMadeChange = false;
@@ -600,6 +748,16 @@
         return false;
       scalarizeMaskedScatter(CI, ModifiedDT);
       return true;
+    case Intrinsic::masked_expandload:
+      if (TTI->isLegalMaskedExpandLoad(CI->getType()))
+        return false;
+      scalarizeMaskedExpandLoad(CI, ModifiedDT);
+      return true;
+    case Intrinsic::masked_compressstore:
+      if (TTI->isLegalMaskedCompressStore(CI->getArgOperand(0)->getType()))
+        return false;
+      scalarizeMaskedCompressStore(CI, ModifiedDT);
+      return true;
     }
   }
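Both scalarizations rely on the semantics that distinguish these intrinsics from plain masked load/store: only the enabled lanes consume consecutive memory locations, which is why the generated code advances the pointer inside the taken branch and joins it back with a PHI. The operand order matches the declarations used by the tests below, shown here for a two-element case (semantic illustration only, not part of the patch):

    ; result = expandload(pointer, mask, passthru)
    declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
    ; compressstore(data, pointer, mask)
    declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)

For example, an expanding load through a pointer to memory holding {7, 8} with mask <i1 false, i1 true> produces <passthru[0], 7>, not <passthru[0], 8>: the disabled lane 0 does not consume a memory slot, which is exactly the behaviour checked by @test5 in pr39666.ll below.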
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -184,6 +184,8 @@
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
   bool isLegalMaskedScatter(Type *DataType);
+  bool isLegalMaskedExpandLoad(Type *DataType);
+  bool isLegalMaskedCompressStore(Type *DataType);
   bool hasDivRemOp(Type *DataType, bool IsSigned);
   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
   bool areInlineCompatible(const Function *Caller,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3014,6 +3014,34 @@
   return isLegalMaskedLoad(DataType);
 }
 
+bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
+  if (!isa<VectorType>(DataTy))
+    return false;
+
+  if (!ST->hasAVX512())
+    return false;
+
+  // The backend can't handle a single element vector.
+  if (DataTy->getVectorNumElements() == 1)
+    return false;
+
+  Type *ScalarTy = DataTy->getVectorElementType();
+
+  if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
+    return true;
+
+  if (!ScalarTy->isIntegerTy())
+    return false;
+
+  unsigned IntWidth = ScalarTy->getIntegerBitWidth();
+  // AVX-512F provides expand/compress for 32- and 64-bit elements; VBMI2 adds
+  // the 8- and 16-bit element forms.
+  return IntWidth == 32 || IntWidth == 64 ||
+         ((IntWidth == 8 || IntWidth == 16) && ST->hasVBMI2());
+}
+
+bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
+  return isLegalMaskedExpandLoad(DataTy);
+}
+
 bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
   // Some CPUs have better gather performance than others.
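To make the rule concrete: on any AVX-512 target the hook reports, for example, <16 x float>, <8 x double>, <16 x i32> and <8 x i64> as legal, reports <32 x i16> and <64 x i8> as legal only when VBMI2 is also available, and always rejects a single-element vector. A small sketch of such queries (Ctx and TTI are assumed to be an existing LLVMContext and the X86 TTI for the chosen subtarget; illustration only, not code from the patch):

    // Representative vector types and what the new hooks return for them.
    Type *V16F32 = VectorType::get(Type::getFloatTy(Ctx), 16); // AVX-512F: true
    Type *V8I64  = VectorType::get(Type::getInt64Ty(Ctx), 8);  // AVX-512F: true
    Type *V64I8  = VectorType::get(Type::getInt8Ty(Ctx), 64);  // true only with VBMI2
    Type *V1I64  = VectorType::get(Type::getInt64Ty(Ctx), 1);  // always false
    bool ExpandOK   = TTI.isLegalMaskedExpandLoad(V16F32);
    bool CompressOK = TTI.isLegalMaskedCompressStore(V8I64);
    bool ByteOK     = TTI.isLegalMaskedExpandLoad(V64I8);
    bool SingleOK   = TTI.isLegalMaskedCompressStore(V1I64);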
  // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
Index: test/CodeGen/X86/pr39666.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr39666.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s
+
+define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %else
+; CHECK-NEXT:    vpinsrq $1, (%rdi), %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %base, <2 x i1> <i1 false, i1 true>, <2 x i64> %src0)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
+
+define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
+; CHECK-LABEL: test11:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpextrb $0, %xmm1, %eax
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %cond.store
+; CHECK-NEXT:    vmovq %xmm0, (%rdi)
+; CHECK-NEXT:    addq $8, %rdi
+; CHECK-NEXT:  .LBB1_2: # %else
+; CHECK-NEXT:    vpextrb $8, %xmm1, %eax
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB1_4
+; CHECK-NEXT:  # %bb.3: # %cond.store1
+; CHECK-NEXT:    vpextrq $1, %xmm0, (%rdi)
+; CHECK-NEXT:  .LBB1_4: # %else2
+; CHECK-NEXT:    retq
+  call void @llvm.masked.compressstore.v2i64(<2 x i64> %V, i64* %base, <2 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
Index: test/CodeGen/X86/pr40994.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/pr40994.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.2 | FileCheck %s
+
+define <8 x i8> @foo(<16 x i8> %a) {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %cond.store
+; CHECK-NEXT:    pextrb $0, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $2, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $4, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $6, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $8, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $10, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $12, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pextrb $14, %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; CHECK-NEXT:    retq
+  %v = alloca i8, i32 8, align 16
+  call void @llvm.masked.compressstore.v16i8(<16 x i8> %a, i8* %v, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>)
+  %ptr = bitcast i8* %v to <8 x i8>*
+  %out = load <8 x i8>, <8 x i8>* %ptr
+  ret <8 x i8> %out
+}
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>)
Index: test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-compressstore.ll
===================================================================
--- /dev/null
+++ test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-compressstore.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define void @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP2]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT:    br i1 [[TMP4]], label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP5]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> %mask)
+  ret void
+}
+
+define void @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT:    br i1 true, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 true, i1 true>)
+  ret void
+}
+
+define void @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 false, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 false>)
+  ret void
+}
+
+define void @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %data) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_STORE:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.store:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[DATA:%.*]], i64 0
+; CHECK-NEXT:    store i64 [[TMP1]], i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP2]], [[COND_STORE]] ], [ [[P]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_STORE1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.store1:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[DATA]], i64 1
+; CHECK-NEXT:    store i64 [[TMP3]], i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64> %data, i64* %p, <2 x i1> <i1 false, i1 true>)
+  ret void
+}
+
+declare void @llvm.masked.compressstore.v2i64.p0v2i64(<2 x i64>, i64*, <2 x i1>)
Index: test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-expandload.ll
===================================================================
--- /dev/null
+++ test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-expandload.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S %s -scalarize-masked-mem-intrin -mtriple=x86_64-linux-gnu | FileCheck %s
+
+define <2 x i64> @scalarize_v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i1> [[MASK:%.*]], i64 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP3]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP4]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[MASK]], i64 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP6]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP7]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> %mask, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_ones_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_ones_mask(
+; CHECK-NEXT:    br i1 true, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 true, i1 true>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_zero_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_zero_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    br i1 false, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 false>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+define <2 x i64> @scalarize_v2i64_const_mask(i64* %p, <2 x i64> %passthru) {
+; CHECK-LABEL: @scalarize_v2i64_const_mask(
+; CHECK-NEXT:    br i1 false, label [[COND_LOAD:%.*]], label [[ELSE:%.*]]
+; CHECK:       cond.load:
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, i64* [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[PASSTHRU:%.*]], i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i32 1
+; CHECK-NEXT:    br label [[ELSE]]
+; CHECK:       else:
+; CHECK-NEXT:    [[RES_PHI_ELSE:%.*]] = phi <2 x i64> [ [[TMP2]], [[COND_LOAD]] ], [ [[PASSTHRU]], [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[PTR_PHI_ELSE:%.*]] = phi i64* [ [[TMP3]], [[COND_LOAD]] ], [ [[P]], [[TMP0]] ]
+; CHECK-NEXT:    br i1 true, label [[COND_LOAD1:%.*]], label [[ELSE2:%.*]]
+; CHECK:       cond.load1:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[PTR_PHI_ELSE]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> [[RES_PHI_ELSE]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    br label [[ELSE2]]
+; CHECK:       else2:
+; CHECK-NEXT:    [[RES_PHI_ELSE3:%.*]] = phi <2 x i64> [ [[TMP5]], [[COND_LOAD1]] ], [ [[RES_PHI_ELSE]], [[ELSE]] ]
+; CHECK-NEXT:    ret <2 x i64> [[RES_PHI_ELSE3]]
+;
+  %ret = call <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64* %p, <2 x i1> <i1 false, i1 true>, <2 x i64> %passthru)
+  ret <2 x i64> %ret
+}
+
+declare <2 x i64> @llvm.masked.expandload.v2i64.p0v2i64(i64*, <2 x i1>, <2 x i64>)
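If the expansion or the X86 lowering changes later, the CHECK lines above can be regenerated with the utilities named in the tests' NOTE lines, along these lines (paths relative to the llvm source tree, assuming the freshly built opt and llc are first on PATH):

    utils/update_test_checks.py test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-expandload.ll \
        test/Transforms/ScalarizeMaskedMemIntrin/X86/expand-masked-compressstore.ll
    utils/update_llc_test_checks.py test/CodeGen/X86/pr39666.ll test/CodeGen/X86/pr40994.ll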