Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1420,7 +1420,22 @@
   /// Returns true if \p I is an instruction that will be scalarized with
   /// predication. Such instructions include conditional stores and
   /// instructions that may divide by zero.
-  bool isScalarWithPredication(Instruction *I);
+  /// If a non-zero VF has been calculated, we check if I will be scalarized
+  /// with predication for that VF.
+  bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
+
+  // Returns true if \p I is an instruction that will be predicated either
+  // through scalar predication or masked load/store or masked gather/scatter.
+  // Superset of instructions that return true for isScalarWithPredication.
+  bool isPredicatedInst(Instruction *I) {
+    if (!Legal->blockNeedsPredication(I->getParent()))
+      return false;
+    // Loads and stores that need some form of masked operation are predicated
+    // instructions.
+    if (isa<LoadInst>(I) || isa<StoreInst>(I))
+      return Legal->isMaskRequired(I);
+    return isScalarWithPredication(I);
+  }
 
   /// Returns true if \p I is a memory instruction with consecutive memory
   /// access that can be widened.
@@ -4367,7 +4382,7 @@
   Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
   if (!Legal->blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
@@ -4379,6 +4394,14 @@
       return false;
     auto *Ptr = getLoadStorePointerOperand(I);
     auto *Ty = getMemInstValueType(I);
+    // We have already decided how to vectorize this instruction, get that
+    // result.
+    if (VF > 1) {
+      InstWidening WideningDecision = getWideningDecision(I, VF);
+      assert(WideningDecision != CM_Unknown &&
+             "Widening decision should be ready at this moment");
+      return WideningDecision == CM_Scalarize;
+    }
     return isa<LoadInst>(I) ?
         !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
       : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
@@ -5472,8 +5495,7 @@
   // from moving "masked load/store" check from legality to cost model.
   // Masked Load/Gather emulation was previously never allowed.
   // Limited number of Masked Store/Scatter emulation was allowed.
-  assert(isScalarWithPredication(I) &&
-         "Expecting a scalar emulated instruction");
+  assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
   return isa<LoadInst>(I) ||
          (isa<StoreInst>(I) && NumPredStores > NumberOfStoresToPredicate);
 
@@ -5734,7 +5756,7 @@
   // If we have a predicated store, it may not be executed for each vector
   // lane. Scale the cost by the probability of executing the predicated
   // block.
-  if (isScalarWithPredication(I)) {
+  if (isPredicatedInst(I)) {
     Cost /= getReciprocalPredBlockProb();
 
     if (useEmulatedMaskMemRefHack(I))
@@ -5880,7 +5902,12 @@
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
         NumPredStores++;
-      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+
+      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr) &&
+          // Conditional loads should be scalarized and predicated.
+          // isScalarWithPredication cannot be used here since masked
+          // gather/scatters are not considered scalar with predication.
+ !Legal->blockNeedsPredication(I.getParent())) { // Scalar load + broadcast unsigned Cost = getUniformMemOpCost(&I, VF); setWideningDecision(&I, VF, CM_Scalarize, Cost); @@ -6720,7 +6747,11 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range) { - if (CM.isScalarWithPredication(I)) + + bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( + [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + + if (IsPredicated) return false; auto IsVectorizableOpcode = [](unsigned Opcode) { @@ -6827,7 +6858,9 @@ [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); }, Range); - bool IsPredicated = CM.isScalarWithPredication(I); + bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( + [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated); // Find if I uses a predicated instruction. If so, it will use its scalar Index: llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/gather_scatter.ll @@ -1,5 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 +; RUN: opt < %s -O3 -mcpu=knl -force-vector-width=2 -S | FileCheck %s -check-prefix=FVW2 + +; With a force-vector-width, it is sometimes more profitable to generate +; scalarized and predicated stores instead of masked scatter. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc_linux" @@ -87,6 +91,73 @@ ; AVX512: for.end: ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo1( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; FVW2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 +; FVW2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 +; FVW2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4 +; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 +; FVW2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4 +; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer +; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer +; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD11]], zeroinitializer +; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD12]], zeroinitializer +; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 
x i32> undef) +; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2 +; FVW2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> undef) +; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4 +; FVW2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> undef) +; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 6 +; FVW2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>* +; FVW2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> undef) +; FVW2-NEXT: [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64> +; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64> +; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD14]] to <2 x i64> +; FVW2-NEXT: [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD15]] to <2 x i64> +; FVW2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]] +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]] +; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]] +; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER17:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER18:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef) +; FVW2-NEXT: [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], +; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], +; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER17]], +; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER18]], +; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]] +; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]]) +; FVW2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 2 +; FVW2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]]) +; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 4 +; FVW2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]]) +; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 6 +; FVW2-NEXT: 
[[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>* +; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]]) +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca float*, align 8 %out.addr = alloca float*, align 8 @@ -290,6 +361,118 @@ ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> 
[[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds 
float, float* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !2 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In*, align 8 %out.addr = alloca float*, align 8 @@ -494,6 +677,118 @@ ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo3( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE29:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE29]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD6]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt 
<2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER9]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD6]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER12]], +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], i64 [[OFFSET_IDX]], i32 1 +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]] +; FVW2: pred.store.if16: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP20]], i32 1 +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE17]] +; FVW2: pred.store.continue17: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]] +; FVW2: pred.store.if18: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP24]], i32 1 +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE19]] +; FVW2: pred.store.continue19: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label 
[[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]] +; FVW2: pred.store.if20: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP28]], i32 1 +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE21]] +; FVW2: pred.store.continue21: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]] +; FVW2: pred.store.if22: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP32]], i32 1 +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE23]] +; FVW2: pred.store.continue23: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]] +; FVW2: pred.store.if24: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP36]], i32 1 +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE25]] +; FVW2: pred.store.continue25: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]] +; FVW2: pred.store.if26: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP40]], i32 1 +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE27]] +; FVW2: pred.store.continue27: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.if28: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[TMP44]], i32 1 +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE29]] +; FVW2: pred.store.continue29: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !3 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In*, align 8 %out.addr = alloca %struct.Out*, align 8 @@ -684,6 +979,118 @@ ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2_addrspace( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, 
[[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr 
inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], 
i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !4 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(1)*, align 8 %out.addr = alloca float addrspace(1)*, align 8 @@ -874,6 +1281,118 @@ ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2_addrspace2( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr 
inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p1f32(<2 x float addrspace(1)*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: [[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: 
[[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; FVW2-NEXT: store float [[TMP46]], float* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(1)*, align 8 %out.addr = alloca float addrspace(0)*, align 8 @@ -1064,6 +1583,118 @@ ; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) ; AVX512-NEXT: ret void ; +; FVW2-LABEL: @foo2_addrspace3( +; FVW2-NEXT: entry: +; FVW2-NEXT: br label [[VECTOR_BODY:%.*]] +; FVW2: vector.body: +; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ] +; FVW2-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE30]] ] +; FVW2-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX6]], 4 +; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <2 x i64> [[VEC_IND]] +; FVW2-NEXT: [[TMP1:%.*]] = 
getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD]] +; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD7]] +; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <2 x i64> [[STEP_ADD8]] +; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP0]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER10:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP1]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER11:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP2]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER12:%.*]] = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> [[TMP3]], i32 4, <2 x i1> , <2 x i32> undef) +; FVW2-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; FVW2-NEXT: [[TMP5:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER10]], zeroinitializer +; FVW2-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER11]], zeroinitializer +; FVW2-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i32> [[WIDE_MASKED_GATHER12]], zeroinitializer +; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <2 x i64> [[VEC_IND]], i32 1 +; FVW2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD]], i32 1 +; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD7]], i32 1 +; FVW2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <2 x i64> [[STEP_ADD8]], i32 1 +; FVW2-NEXT: [[WIDE_MASKED_GATHER13:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP8]], i32 4, <2 x i1> [[TMP4]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER14:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP9]], i32 4, <2 x i1> [[TMP5]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER15:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP10]], i32 4, <2 x i1> [[TMP6]], <2 x float> undef) +; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP11]], i32 4, <2 x i1> [[TMP7]], <2 x float> undef) +; FVW2-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER13]], +; FVW2-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER14]], +; FVW2-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER15]], +; FVW2-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], +; FVW2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; FVW2-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; FVW2: pred.store.if: +; FVW2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], i64 [[OFFSET_IDX]] +; FVW2-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP12]], i32 0 +; FVW2-NEXT: store float [[TMP18]], float addrspace(1)* [[TMP17]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE]] +; FVW2: pred.store.continue: +; FVW2-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; FVW2-NEXT: br i1 [[TMP19]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]] +; FVW2: pred.store.if17: +; FVW2-NEXT: [[TMP20:%.*]] = or i64 [[OFFSET_IDX]], 16 +; FVW2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP20]] +; FVW2-NEXT: 
[[TMP22:%.*]] = extractelement <2 x float> [[TMP12]], i32 1 +; FVW2-NEXT: store float [[TMP22]], float addrspace(1)* [[TMP21]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE18]] +; FVW2: pred.store.continue18: +; FVW2-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; FVW2-NEXT: br i1 [[TMP23]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]] +; FVW2: pred.store.if19: +; FVW2-NEXT: [[TMP24:%.*]] = or i64 [[OFFSET_IDX]], 32 +; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP24]] +; FVW2-NEXT: [[TMP26:%.*]] = extractelement <2 x float> [[TMP13]], i32 0 +; FVW2-NEXT: store float [[TMP26]], float addrspace(1)* [[TMP25]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE20]] +; FVW2: pred.store.continue20: +; FVW2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; FVW2-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22:%.*]] +; FVW2: pred.store.if21: +; FVW2-NEXT: [[TMP28:%.*]] = or i64 [[OFFSET_IDX]], 48 +; FVW2-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP28]] +; FVW2-NEXT: [[TMP30:%.*]] = extractelement <2 x float> [[TMP13]], i32 1 +; FVW2-NEXT: store float [[TMP30]], float addrspace(1)* [[TMP29]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE22]] +; FVW2: pred.store.continue22: +; FVW2-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; FVW2-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]] +; FVW2: pred.store.if23: +; FVW2-NEXT: [[TMP32:%.*]] = or i64 [[OFFSET_IDX]], 64 +; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP32]] +; FVW2-NEXT: [[TMP34:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 +; FVW2-NEXT: store float [[TMP34]], float addrspace(1)* [[TMP33]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE24]] +; FVW2: pred.store.continue24: +; FVW2-NEXT: [[TMP35:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; FVW2-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]] +; FVW2: pred.store.if25: +; FVW2-NEXT: [[TMP36:%.*]] = or i64 [[OFFSET_IDX]], 80 +; FVW2-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP36]] +; FVW2-NEXT: [[TMP38:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; FVW2-NEXT: store float [[TMP38]], float addrspace(1)* [[TMP37]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE26]] +; FVW2: pred.store.continue26: +; FVW2-NEXT: [[TMP39:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; FVW2-NEXT: br i1 [[TMP39]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]] +; FVW2: pred.store.if27: +; FVW2-NEXT: [[TMP40:%.*]] = or i64 [[OFFSET_IDX]], 96 +; FVW2-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP40]] +; FVW2-NEXT: [[TMP42:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; FVW2-NEXT: store float [[TMP42]], float addrspace(1)* [[TMP41]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE28]] +; FVW2: pred.store.continue28: +; FVW2-NEXT: [[TMP43:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; FVW2-NEXT: br i1 [[TMP43]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.if29: +; FVW2-NEXT: [[TMP44:%.*]] = or i64 [[OFFSET_IDX]], 112 +; FVW2-NEXT: [[TMP45:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[TMP44]] +; FVW2-NEXT: [[TMP46:%.*]] = extractelement <2 x float> [[TMP15]], 
i32 1 +; FVW2-NEXT: store float [[TMP46]], float addrspace(1)* [[TMP45]], align 4 +; FVW2-NEXT: br label [[PRED_STORE_CONTINUE30]] +; FVW2: pred.store.continue30: +; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8 +; FVW2-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], +; FVW2-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; FVW2-NEXT: br i1 [[TMP47]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !6 +; FVW2: for.end: +; FVW2-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(0)*, align 8 %out.addr = alloca float addrspace(1)*, align 8 Index: llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll =================================================================== --- llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ llvm/trunk/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 -instcombine < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { +; CHECK-LABEL: @inv_load_conditional( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[NTRUNC:%.*]] = trunc i64 [[N:%.*]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX:%.*]] = select i1 [[TMP0]], i64 [[N]], i64 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[A4:%.*]] = bitcast i32* [[A:%.*]] to i8* +; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i64 [[N]], 1 +; CHECK-NEXT: [[SMAX2:%.*]] = select i1 [[TMP1]], i64 [[N]], i64 1 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[SMAX2]] +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, i8* [[A4]], i64 1 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i8* [[UGLYGEP]], [[B1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i32*> undef, i32* [[A]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i32*> [[BROADCAST_SPLATINSERT5]], <16 x i32*> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <16 x i32> undef, i32 [[NTRUNC]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT7]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT6]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT8]], <16 x i32>* [[TMP4]], align 4, !alias.scope !0, !noalias !3 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; 
CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK: middle.block: +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT6]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[PREDPHI]], i32 15 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32* [[A]], null +; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[TMP1]], align 4 +; CHECK-NEXT: br i1 [[CMP]], label [[LATCH]], label [[COND_LOAD:%.*]] +; CHECK: cond_load: +; CHECK-NEXT: [[ALOAD:%.*]] = load i32, i32* [[A]], align 4 +; CHECK-NEXT: br label [[LATCH]] +; CHECK: latch: +; CHECK-NEXT: [[A_LCSSA:%.*]] = phi i32 [ [[ALOAD]], [[COND_LOAD]] ], [ 1, [[FOR_BODY]] ] +; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 +; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !7 +; CHECK: for.end: +; CHECK-NEXT: [[A_LCSSA_LCSSA:%.*]] = phi i32 [ [[A_LCSSA]], [[LATCH]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[A_LCSSA_LCSSA]] +; +entry: + %ntrunc = trunc i64 %n to i32 + br label %for.body + +for.body: ; preds = %for.body, %entry + %i = phi i64 [ %i.next, %latch ], [ 0, %entry ] + %tmp1 = getelementptr inbounds i32, i32* %b, i64 %i + %tmp2 = load i32, i32* %tmp1, align 8 + %cmp = icmp ne i32* %a, null + store i32 %ntrunc, i32* %tmp1 + br i1 %cmp, label %cond_load, label %latch + +cond_load: + %aload = load i32, i32* %a, align 4 + br label %latch + +latch: + %a.lcssa = phi i32 [ %aload, %cond_load ], [ 1, %for.body ] + %i.next = add nuw nsw i64 %i, 1 + %cond = icmp slt i64 %i.next, %n + br i1 %cond, label %for.body, label %for.end + +for.end: ; preds = %for.body + ret i32 %a.lcssa +}
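
For reference, the FVW2 checks above exercise loops whose store is guarded by a loop-varying condition. The C++ sketch below is illustrative only: the struct layout, names, and the stride of 16 are assumptions chosen to mirror the shape of the foo2 test IR, not code taken from this commit.

// Illustrative only: approximates the loop shape behind the foo2 FVW2 checks.
// The struct layout, identifiers, and stride are assumptions, not the
// original source of the test.
struct In { float a, b; };

void conditional_store(In *in, float *out, int *trigger, int size) {
  for (int i = 0; i < size; i += 16) {
    if (trigger[i] > 0)           // loop-varying predicate
      out[i] = in[i].b + 0.5f;    // store executes only on active lanes
  }
}

With -force-vector-width=2 the cost model can assign CM_Scalarize to the guarded store, and the VF-aware isScalarWithPredication / isPredicatedInst queries then let VPlan emit the pred.store.if / pred.store.continue blocks checked above instead of an @llvm.masked.scatter call.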