diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -229,6 +229,18 @@ DstTy.getSimpleVT(), SrcTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } + + static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { + // FPExtends are similar but also require the VCVT instructions. + {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, + {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3}, + }; + if (SrcTy.isVector() && ST->hasMVEFloatOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, + DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); + } } // The truncate of a store is free. This is the mirror of extends above. @@ -247,6 +259,17 @@ DstTy.getSimpleVT())) return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); } + + static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = { + {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1}, + {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3}, + }; + if (SrcTy.isVector() && ST->hasMVEFloatOps()) { + if (const auto *Entry = + ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(), + DstTy.getSimpleVT())) + return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor()); + } } // NEON vector operations that can extend their inputs. @@ -955,6 +978,22 @@ return LT.first * 4; } + // MVE can optimize a fpext(load(4xhalf)) using an extending integer load. + // Same for stores. + if (ST->hasMVEFloatOps() && isa(Src) && I && + ((Opcode == Instruction::Load && I->hasOneUse() && + isa(*I->user_begin())) || + (Opcode == Instruction::Store && isa(I->getOperand(0))))) { + FixedVectorType *SrcVTy = cast(Src); + Type *DstTy = + Opcode == Instruction::Load + ? (*I->user_begin())->getType() + : cast(I->getOperand(0))->getOperand(0)->getType(); + if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() && + DstTy->getScalarType()->isFloatTy()) + return ST->getMVEVectorCostFactor(); + } + int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy() ? ST->getMVEVectorCostFactor() : 1; diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll --- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll +++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll @@ -1284,8 +1284,8 @@ ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r2 = fpext half %loadf16 to double ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r3 = fpext float %loadf32 to double ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double> @@ -1294,8 +1294,8 @@ ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v9 = fpext <2 x float> %loadv2f32 to <2 x double> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8 -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8 +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8M-MAIN-RECIP-LABEL: 'load_fpextends' @@ -1396,8 +1396,8 @@ ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r2 = fpext half %loadf16 to double ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r3 = fpext float %loadf32 to double ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double> @@ -1407,7 +1407,7 @@ ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8 -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8M-MAIN-SIZE-LABEL: 'load_fpextends' @@ -1558,9 +1558,9 @@ ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %i3264 = fptrunc double undef to float ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float> @@ -1569,7 +1569,7 @@ ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %i3264, float* undef, align 4 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21632, <2 x half>* undef, align 4 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21664, <2 x half>* undef, align 4 -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: store <4 x half> %v41632, <4 x half>* undef, align 8 +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x half> %v41632, <4 x half>* undef, align 8 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: store <4 x half> %v41664, <4 x half>* undef, align 8 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81632, <8 x half>* undef, align 16 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81664, <8 x half>* undef, align 16 @@ -1658,9 +1658,9 @@ ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %i3264 = fptrunc double undef to float ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half> -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float> @@ -2784,8 +2784,8 @@ ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v9 = fpext <2 x float> %loadv2f32 to <2 x double> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8 -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8 +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8M-MAIN-RECIP-LABEL: 'maskedload_fpextends' @@ -2877,7 +2877,7 @@ ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v10 = fpext <4 x float> %loadv4f32 to <4 x double> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v11 = fpext <8 x float> %loadv8f32 to <8 x double> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f16ou = load <4 x half>, <4 x half>* undef, align 8 -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2ou = fpext <4 x half> %loadv4f16ou to <4 x float> ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; ; CHECK-V8M-MAIN-SIZE-LABEL: 'maskedload_fpextends' diff --git a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll --- a/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll @@ -562,14 +562,12 @@ br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; TODO: this fpext could be allowed, but we don't lower it very efficiently yet, -; so reject this for now. define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 { ; CHECK-LABEL: fpext_allowed( -; PREFER-FOLDING-NOT: vector.body: +; PREFER-FOLDING: vector.body: ; PREFER-FOLDING-NOT: llvm.masked.load ; PREFER-FOLDING-NOT: llvm.masked.store -; PREFER-FOLDING-NOT: br i1 %{{.*}}, label %{{.*}}, label %vector.body +; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body entry: br label %for.body @@ -591,14 +589,12 @@ br i1 %exitcond, label %for.cond.cleanup, label %for.body } -; TODO: this fptrunc could be allowed, but we don't lower it very efficiently yet, -; so reject this for now. define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 { ; CHECK-LABEL: fptrunc_allowed( -; PREFER-FOLDING-NOT: vector.body: +; PREFER-FOLDING: vector.body: ; PREFER-FOLDING-NOT: llvm.masked.load ; PREFER-FOLDING-NOT: llvm.masked.store -; PREFER-FOLDING-NOT: br i1 %{{.*}}, label %{{.*}}, label %vector.body +; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body entry: br label %for.body