Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -103,7 +103,7 @@ #include "AArch64GenSubtargetInfo.inc" uint8_t MaxInterleaveFactor = 2; - uint8_t VectorInsertExtractBaseCost = 3; + uint8_t VectorInsertExtractBaseCost = 2; uint16_t CacheLineSize = 0; uint16_t PrefetchDistance = 0; uint16_t MinPrefetchStride = 1; Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -385,6 +385,11 @@ VectorType *SubTp, ArrayRef<const Value *> Args = std::nullopt); + InstructionCost getScalarizationOverhead(VectorType *Ty, + const APInt &DemandedElts, + bool Insert, bool Extract, + TTI::TargetCostKind CostKind); + /// Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. 
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2560,6 +2560,18 @@ return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */); } +InstructionCost AArch64TTIImpl::getScalarizationOverhead( + VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, + TTI::TargetCostKind CostKind) { + if (isa<ScalableVectorType>(Ty)) + return InstructionCost::getInvalid(); + if (Ty->getElementType()->isFloatingPointTy()) + return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, + CostKind); + return DemandedElts.popcount() * (Insert + Extract) * + ST->getVectorInsertExtractBaseCost(); +} + InstructionCost AArch64TTIImpl::getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, Index: llvm/test/Analysis/CostModel/AArch64/arith-fp.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/arith-fp.ll +++ llvm/test/Analysis/CostModel/AArch64/arith-fp.ll @@ -198,16 +198,16 @@ define i32 @frem(i32 %arg) { ; CHECK-LABEL: 'frem' ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4F16 = frem <4 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8F16 = frem <8 x half> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V16F16 = frem <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = frem <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8F16 = frem <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 88 for instruction: %V16F16 = frem <16 x half> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = frem <2 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4F32 = frem <4 x float> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8F32 = frem <8 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F32 = frem <2 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = frem <4 x float> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = frem <2 x double> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = frem <4 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V2F64 = frem <2 x double> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = frem <4 x double> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %F16 = frem half undef, undef Index: llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll +++ llvm/test/Analysis/CostModel/AArch64/arith-overflow.ll @@ -355,9 +355,9 @@ define i32 @smul(i32 %arg) { ; RECIP-LABEL: 'smul' ; RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef) -; 
RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; RECIP-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef) ; RECIP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) @@ -437,9 +437,9 @@ define i32 @umul(i32 %arg) { ; RECIP-LABEL: 'umul' ; RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef) -; RECIP-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; RECIP-NEXT: Cost Model: Found an estimated 
cost of 34 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; RECIP-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; RECIP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; RECIP-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef) ; RECIP-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) ; RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) Index: llvm/test/Analysis/CostModel/AArch64/bswap.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/bswap.ll +++ llvm/test/Analysis/CostModel/AArch64/bswap.ll @@ -44,7 +44,7 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v3i32 = call <3 x i32> @llvm.bswap.v3i32(<3 x i32> undef) -; CHECK-NEXT: Cost Model: 
Found an estimated cost of 10 for instruction: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i48 = call <4 x i48> @llvm.bswap.v4i48(<4 x i48> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v4i16 = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> undef) Index: llvm/test/Analysis/CostModel/AArch64/cast.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/cast.ll +++ llvm/test/Analysis/CostModel/AArch64/cast.ll @@ -947,8 +947,8 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16> @@ -967,8 +967,8 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 
for instruction: %r128 = fptoui <4 x double> undef to <4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1> -; CHECK-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r133 = fptosi <8 x float> undef to <8 x i8> ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r134 = fptoui <8 x float> undef to <8 x i16> @@ -987,8 +987,8 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1> -; CHECK-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1> +; CHECK-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1> ; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8> ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %r153 = fptosi <16 x float> undef to <16 x i8> ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r154 = fptoui <16 x float> undef to <16 x i16> @@ -1373,8 +1373,8 @@ ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16> @@ -1586,8 +1586,8 @@ ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 28 
for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r113 = fptosi <4 x float> undef to <4 x i8> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r114 = fptoui <4 x float> undef to <4 x i16> @@ -3214,9 +3214,9 @@ define void @extend_extract() { ; CHECK-LABEL: 'extend_extract' -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %e8 = extractelement <8 x i8> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %e16 = extractelement <8 x i16> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %e32 = extractelement <8 x i32> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = extractelement <8 x i8> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e16 = extractelement <8 x i16> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e32 = extractelement <8 x i32> undef, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8_16 = sext i8 %e8 to i16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8_16 = zext i8 %e8 to i16 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8_32 = sext i8 %e8 to i32 @@ -3233,9 +3233,9 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret void ; ; SVE-LABEL: 'extend_extract' -; SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %e8 = extractelement <8 x i8> undef, i32 1 -; SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %e16 = extractelement <8 x i16> undef, i32 1 -; SVE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %e32 = extractelement <8 x i32> undef, i32 1 +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e8 = extractelement <8 x i8> undef, i32 1 +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e16 = extractelement <8 x i16> undef, i32 1 +; SVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %e32 = extractelement <8 x i32> undef, i32 1 ; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8_16 = sext i8 %e8 to i16 ; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %z8_16 = zext i8 %e8 to i16 ; SVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s8_32 = sext i8 %e8 to i32 @@ -3292,38 +3292,38 @@ ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %r110 = fptoui <4 x half> undef to <4 x i1> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r113 = fptosi <4 x half> undef to <4 x i8> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r114 = fptoui <4 x half> undef to <4 x i16> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 x i16> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 21 for 
instruction: %r118 = fptoui <4 x half> undef to <4 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8> +; CHECK-NOFP16-NEXT: Cost Model: Found an 
estimated cost of 41 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64> -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32> +; 
CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64> +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half> ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half> @@ -3438,8 +3438,8 @@ ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8> @@ -3448,28 +3448,28 @@ ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r115 = fptosi <4 x half> undef to <4 
x i16> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x half> undef to <4 x i32> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r135 = fptosi <8 x half> undef to <8 x i16> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x half> undef to <8 x i32> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x half> undef to <8 x i32> -; CHECK-FP16-NEXT: Cost 
Model: Found an estimated cost of 35 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r138 = fptoui <8 x half> undef to <8 x i64> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %r139 = fptosi <8 x half> undef to <8 x i64> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r150 = fptoui <16 x half> undef to <16 x i1> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %r151 = fptosi <16 x half> undef to <16 x i1> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r152 = fptoui <16 x half> undef to <16 x i8> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r153 = fptosi <16 x half> undef to <16 x i8> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r154 = fptoui <16 x half> undef to <16 x i16> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r155 = fptosi <16 x half> undef to <16 x i16> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x half> undef to <16 x i32> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x half> undef to <16 x i32> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64> -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64> +; CHECK-FP16-NEXT: Cost Model: 
Found an estimated cost of 86 for instruction: %r158 = fptoui <16 x half> undef to <16 x i64> +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %r159 = fptosi <16 x half> undef to <16 x i64> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r250 = uitofp <8 x i1> undef to <8 x half> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r251 = sitofp <8 x i1> undef to <8 x half> ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r252 = uitofp <8 x i8> undef to <8 x half> @@ -3511,8 +3511,8 @@ ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> +; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8> @@ -3523,8 +3523,8 @@ ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32> ; 
FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> -; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> +; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8> ; FIXED-MIN-256-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16> @@ -3584,8 +3584,8 @@ ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r95 = fptosi <2 x half> undef to <2 x i16> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r96 = fptoui <2 x half> undef to <2 x i32> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r97 = fptosi <2 x half> undef to <2 x i32> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r99 = fptosi <2 x half> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r98 = fptoui <2 x half> undef to <2 x i64> +; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r99 = fptosi 
<2 x half> undef to <2 x i64> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r110 = fptoui <4 x half> undef to <4 x i1> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r111 = fptosi <4 x half> undef to <4 x i1> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r112 = fptoui <4 x half> undef to <4 x i8> @@ -3596,8 +3596,8 @@ ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x half> undef to <4 x i32> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r118 = fptoui <4 x half> undef to <4 x i64> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r119 = fptosi <4 x half> undef to <4 x i64> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> -; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r130 = fptoui <8 x half> undef to <8 x i1> +; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r131 = fptosi <8 x half> undef to <8 x i1> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132 = fptoui <8 x half> undef to <8 x i8> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r133 = fptosi <8 x half> undef to <8 x i8> ; FIXED-MIN-2048-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r134 = fptoui <8 x half> undef to <8 x i16> Index: llvm/test/Analysis/CostModel/AArch64/cmp.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/cmp.ll +++ llvm/test/Analysis/CostModel/AArch64/cmp.ll @@ -17,7 +17,7 @@ ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %cf16 = fcmp oge half undef, undef ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cf32 = fcmp ogt float undef, undef ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cf64 = fcmp ogt double undef, undef -; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef +; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cfv816 = fcmp olt <8 x half> undef, undef ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cfv432 = fcmp oge <4 x float> undef, undef ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cfv264 = fcmp oge <2 x double> undef, undef ; CHECK-THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void Index: llvm/test/Analysis/CostModel/AArch64/ctlz.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/ctlz.ll +++ llvm/test/Analysis/CostModel/AArch64/ctlz.ll @@ -55,7 +55,7 @@ define <2 x i64> @test_ctlz_v2i64(<2 x i64> %a) { ; ; CHECK-LABEL: 'test_ctlz_v2i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %ctlz ; %ctlz = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %a, i1 false) @@ -154,7 +154,7 @@ define <4 x i64> @test_ctlz_v4i64(<4 x i64> %a) { ; CHECK-LABEL: 'test_ctlz_v4i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %ctlz = call <4 x i64> 
@llvm.ctlz.v4i64(<4 x i64> %a, i1 false) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %ctlz ; %ctlz = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %a, i1 false) Index: llvm/test/Analysis/CostModel/AArch64/cttz.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/cttz.ll +++ llvm/test/Analysis/CostModel/AArch64/cttz.ll @@ -55,7 +55,7 @@ define <2 x i64> @test_cttz_v2i64(<2 x i64> %a) { ; ; CHECK-LABEL: 'test_cttz_v2i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %cttz ; %cttz = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %a, i1 true) @@ -65,7 +65,7 @@ define <2 x i32> @test_cttz_v2i32(<2 x i32> %a) { ; ; CHECK-LABEL: 'test_cttz_v2i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %cttz ; %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %a, i1 true) @@ -75,7 +75,7 @@ define <4 x i32> @test_cttz_v4i32(<4 x i32> %a) { ; ; CHECK-LABEL: 'test_cttz_v4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %cttz ; %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x 
i32> %a, i1 true) @@ -85,7 +85,7 @@ define <2 x i16> @test_cttz_v2i16(<2 x i16> %a) { ; ; CHECK-LABEL: 'test_cttz_v2i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i16> %cttz ; %cttz = call <2 x i16> @llvm.cttz.v2i16(<2 x i16> %a, i1 true) @@ -95,7 +95,7 @@ define <4 x i16> @test_cttz_v4i16(<4 x i16> %a) { ; ; CHECK-LABEL: 'test_cttz_v4i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %cttz ; %cttz = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %a, i1 true) @@ -105,7 +105,7 @@ define <8 x i16> @test_cttz_v8i16(<8 x i16> %a) { ; ; CHECK-LABEL: 'test_cttz_v8i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %cttz ; %cttz = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %a, i1 true) @@ -115,7 +115,7 @@ define <2 x i8> @test_cttz_v2i8(<2 x i8> %a) { ; ; CHECK-LABEL: 'test_cttz_v2i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i8> %cttz ; %cttz = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %a, i1 true) @@ -125,7 +125,7 @@ define <4 x i8> @test_cttz_v4i8(<4 x i8> %a) { ; ; CHECK-LABEL: 'test_cttz_v4i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %cttz ; %cttz = call <4 x i8> @llvm.cttz.v4i8(<4 x i8> %a, i1 true) @@ -135,7 +135,7 @@ define <8 x i8> @test_cttz_v8i8(<8 x i8> %a) { ; ; CHECK-LABEL: 'test_cttz_v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %cttz ; %cttz = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %a, i1 true) @@ -145,7 +145,7 @@ define <16 x i8> @test_cttz_v16i8(<16 x i8> %a) { ; ; CHECK-LABEL: 'test_cttz_v16i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %cttz ; %cttz = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %a, i1 true) @@ -155,7 +155,7 @@ define <4 x i64> @test_cttz_v4i64(<4 x i64> %a) { ; ; CHECK-LABEL: 'test_cttz_v4i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 20 for instruction: %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %cttz ; %cttz = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %a, i1 true) @@ -165,7 +165,7 @@ define <8 x i32> @test_cttz_v8i32(<8 x i32> %a) { ; ; CHECK-LABEL: 'test_cttz_v8i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %cttz ; %cttz = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %a, i1 true) @@ -175,7 +175,7 @@ define <16 x i16> @test_cttz_v16i16(<16 x i16> %a) { ; ; CHECK-LABEL: 'test_cttz_v16i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %cttz ; %cttz = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %a, i1 true) @@ -185,7 +185,7 @@ define <32 x i8> @test_cttz_v32i8(<32 x i8> %a) { ; ; CHECK-LABEL: 'test_cttz_v32i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true) +; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %cttz ; %cttz = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %a, i1 true) Index: llvm/test/Analysis/CostModel/AArch64/div.ll 
=================================================================== --- llvm/test/Analysis/CostModel/AArch64/div.ll +++ llvm/test/Analysis/CostModel/AArch64/div.ll @@ -6,21 +6,21 @@ define i32 @sdiv() { ; CHECK-LABEL: 'sdiv' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, undef -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = sdiv i64 undef, undef @@ -49,21 +49,21 @@ define i32 @udiv() { ; CHECK-LABEL: 'udiv' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = udiv i64 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for 
instruction: %V2i64 = udiv <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 220 for 
instruction: %V16i8 = udiv <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = udiv i64 undef, undef @@ -92,21 +92,21 @@ define i32 @sdiv_const() { ; CHECK-LABEL: 'sdiv_const' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 
48 for instruction: %V4i32 = sdiv <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = sdiv i64 undef, 7 @@ -135,21 +135,21 @@ define i32 @udiv_const() { ; CHECK-LABEL: 'udiv_const' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: 
%I64 = udiv i64 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = 
udiv <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = udiv i64 undef, 7 @@ -264,21 +264,21 @@ define i32 @sdiv_constpow2() { ; CHECK-LABEL: 'sdiv_constpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> 
undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = sdiv i64 undef, 16 @@ -307,21 +307,21 @@ define i32 @udiv_constpow2() { ; CHECK-LABEL: 'udiv_constpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, -; CHECK-NEXT: 
Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = udiv i64 undef, 16 @@ -350,21 +350,21 @@ define i32 @sdiv_uniformconstpow2() { ; CHECK-LABEL: 'sdiv_uniformconstpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = sdiv i64 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2i64 = sdiv <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4i64 = sdiv <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8i64 = sdiv <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V2i64 = sdiv <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i64 = sdiv <4 x i64> undef, +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 60 for instruction: %V8i64 = sdiv <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I32 = sdiv i32 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V4i32 = sdiv <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V8i32 = sdiv <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V16i32 = sdiv <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4i32 = sdiv <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V8i32 = sdiv <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V16i32 = sdiv <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = sdiv i16 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V8i16 = sdiv <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %V16i16 = sdiv <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 244 for instruction: %V32i16 = sdiv <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8i16 = sdiv <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 102 for instruction: %V16i16 = sdiv <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 204 for instruction: %V32i16 = sdiv <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = sdiv i8 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V16i8 = sdiv <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %V32i8 = sdiv <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 500 for instruction: %V64i8 = sdiv <64 x i8> undef, +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 99 for instruction: %V16i8 = sdiv <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %V32i8 = sdiv <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = sdiv i64 undef, 16 @@ -436,21 +436,21 @@ define i32 @sdiv_constnegpow2() { ; CHECK-LABEL: 'sdiv_constnegpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = sdiv i64 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = sdiv <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = sdiv <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = sdiv <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = sdiv <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = sdiv <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = sdiv <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = sdiv <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = sdiv <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = sdiv <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = sdiv <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = sdiv <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = sdiv <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = sdiv <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = sdiv <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = sdiv <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = sdiv <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = sdiv <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = sdiv <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 220 for instruction: %V16i8 = sdiv <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = sdiv <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = sdiv <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = sdiv <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = sdiv <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = sdiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = sdiv i64 undef, -16 @@ -479,21 +479,21 @@ define i32 @udiv_constnegpow2() { ; CHECK-LABEL: 'udiv_constnegpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = udiv i64 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = udiv <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = udiv <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 96 for instruction: %V8i64 = udiv <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2i64 = udiv <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4i64 = udiv <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V8i64 = udiv <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i32 = udiv <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i32 = udiv <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V16i32 = udiv <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i32 = udiv <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i32 = udiv <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16i32 = udiv <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V8i16 = udiv <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V16i16 = udiv <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 432 for instruction: %V32i16 = udiv <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = udiv <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = udiv <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = udiv <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = udiv i8 undef, -16 -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 220 for instruction: %V16i8 = udiv <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 440 for instruction: %V32i8 = udiv <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 880 for instruction: %V64i8 = udiv <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %V16i8 = udiv <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 336 for instruction: %V32i8 = udiv <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %V64i8 = udiv <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = udiv i64 undef, -16 Index: llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll +++ llvm/test/Analysis/CostModel/AArch64/fptoi_sat.ll @@ -34,7 +34,7 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f32u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f32s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f32u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f64(<2 x double> undef) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v2f64u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f64(<2 x double> undef) @@ -233,46 +233,46 @@ ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u32 = call i32 @llvm.fptoui.sat.i32.f16(half undef) ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %f16s64 = call i64 @llvm.fptosi.sat.i64.f16(half undef) ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %f16u64 = call i64 @llvm.fptoui.sat.i64.f16(half undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an 
estimated cost of 30 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 83 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 163 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 206 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found 
an estimated cost of 250 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 187 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 366 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 177 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 547 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 413 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 504 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 385 for instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 360 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) -; 
CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 286 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 374 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 732 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 354 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s1 = call <2 x i1> @llvm.fptosi.sat.v2i1.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u1 = call <2 x i1> @llvm.fptoui.sat.v2i1.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s8 = call <2 x i8> @llvm.fptosi.sat.v2i8.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v2f16u32 = call <2 x i32> 
@llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f16u32 = call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 149 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v8f16s1 = call <8 x 
i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %v8f16u8 = call <8 x i8> @llvm.fptoui.sat.v8i8.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 147 for instruction: %v8f16s16 = call <8 x i16> @llvm.fptosi.sat.v8i16.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 141 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 325 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 373 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 281 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 373 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 281 for 
instruction: %v16f16u8 = call <16 x i8> @llvm.fptoui.sat.v16i8.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 262 for instruction: %v16f16s16 = call <16 x i16> @llvm.fptosi.sat.v16i16.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 342 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 650 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 286 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) ; CHECK-NOFP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-FP16-LABEL: 'fp16' @@ -292,20 +292,20 @@ ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2f16u8 = call <2 x i8> @llvm.fptoui.sat.v2i8.v2f16(<2 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16s16 = call <2 x i16> @llvm.fptosi.sat.v2i16.v2f16(<2 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f16u16 = call <2 x i16> @llvm.fptoui.sat.v2i16.v2f16(<2 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16s32 = call <2 x i32> @llvm.fptosi.sat.v2i32.v2f16(<2 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated 
cost of 3 for instruction: %v2f16u32 = call <2 x i32> @llvm.fptoui.sat.v2i32.v2f16(<2 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v2f16s64 = call <2 x i64> @llvm.fptosi.sat.v2i64.v2f16(<2 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16u64 = call <2 x i64> @llvm.fptoui.sat.v2i64.v2f16(<2 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16s1 = call <4 x i1> @llvm.fptosi.sat.v4i1.v4f16(<4 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16u1 = call <4 x i1> @llvm.fptoui.sat.v4i1.v4f16(<4 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16s8 = call <4 x i8> @llvm.fptosi.sat.v4i8.v4f16(<4 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f16u8 = call <4 x i8> @llvm.fptoui.sat.v4i8.v4f16(<4 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16s16 = call <4 x i16> @llvm.fptosi.sat.v4i16.v4f16(<4 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f16u16 = call <4 x i16> @llvm.fptoui.sat.v4i16.v4f16(<4 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %v4f16s32 = call <4 x i32> @llvm.fptosi.sat.v4i32.v4f16(<4 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16u32 
= call <4 x i32> @llvm.fptoui.sat.v4i32.v4f16(<4 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 100 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v4f16s64 = call <4 x i64> @llvm.fptosi.sat.v4i64.v4f16(<4 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %v4f16u64 = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f16(<4 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16s1 = call <8 x i1> @llvm.fptosi.sat.v8i1.v8f16(<8 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16u1 = call <8 x i1> @llvm.fptoui.sat.v8i1.v8f16(<8 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8f16s8 = call <8 x i8> @llvm.fptosi.sat.v8i8.v8f16(<8 x half> undef) @@ -314,8 +314,8 @@ ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8f16u16 = call <8 x i16> @llvm.fptoui.sat.v8i16.v8f16(<8 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v8f16s32 = call <8 x i32> @llvm.fptosi.sat.v8i32.v8f16(<8 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8f16u32 = call <8 x i32> @llvm.fptoui.sat.v8i32.v8f16(<8 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 198 for instruction: %v8f16s64 = call <8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 206 for instruction: %v8f16s64 = call 
<8 x i64> @llvm.fptosi.sat.v8i64.v8f16(<8 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %v8f16u64 = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f16(<8 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16s1 = call <16 x i1> @llvm.fptosi.sat.v16i1.v16f16(<16 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16u1 = call <16 x i1> @llvm.fptoui.sat.v16i1.v16f16(<16 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16f16s8 = call <16 x i8> @llvm.fptosi.sat.v16i8.v16f16(<16 x half> undef) @@ -324,8 +324,8 @@ ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16f16u16 = call <16 x i16> @llvm.fptoui.sat.v16i16.v16f16(<16 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v16f16s32 = call <16 x i32> @llvm.fptosi.sat.v16i32.v16f16(<16 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16f16u32 = call <16 x i32> @llvm.fptoui.sat.v16i32.v16f16(<16 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) -; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %v16f16s64 = call <16 x i64> @llvm.fptosi.sat.v16i64.v16f16(<16 x half> undef) +; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %v16f16u64 = call <16 x i64> @llvm.fptoui.sat.v16i64.v16f16(<16 x half> undef) ; CHECK-FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %f16s1 = call i1 @llvm.fptosi.sat.i1.f16(half undef) Index: 
llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll +++ llvm/test/Analysis/CostModel/AArch64/free-widening-casts.ll @@ -616,7 +616,7 @@ } ; COST-LABEL: neg_llegal_vector_type_3 -; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68> +; COST-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68> define <3 x i68> @neg_llegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) { %tmp0 = zext <3 x i34> %a to <3 x i68> %tmp1 = add <3 x i68> %b, %tmp0 Index: llvm/test/Analysis/CostModel/AArch64/fshl.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/fshl.ll +++ llvm/test/Analysis/CostModel/AArch64/fshl.ll @@ -116,7 +116,7 @@ define <16 x i8> @fshl_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK-LABEL: 'fshl_v16i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl ; entry: @@ -148,7 +148,7 @@ define <8 x i16> @fshl_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; CHECK-LABEL: 'fshl_v8i16_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret <8 x i16> %fshl ; entry: @@ -180,7 +180,7 @@ define <4 x i32> @fshl_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: 'fshl_v4i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl ; entry: @@ -212,7 +212,7 @@ define <2 x i64> @fshl_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: 'fshl_v2i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl ; entry: @@ -224,7 +224,7 @@ define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) { ; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshl ; entry: @@ -236,7 +236,7 @@ define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) { ; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: 
%fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshl ; entry: @@ -259,7 +259,7 @@ define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) { ; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshl = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %fshl ; entry: Index: llvm/test/Analysis/CostModel/AArch64/fshr.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/fshr.ll +++ llvm/test/Analysis/CostModel/AArch64/fshr.ll @@ -116,7 +116,7 @@ define <16 x i8> @fshr_v16i8_3rd_arg_var(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { ; CHECK-LABEL: 'fshr_v16i8_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshr ; entry: @@ -148,7 +148,7 @@ define <8 x i16> @fshr_v8i16_3rd_arg_var(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) { ; CHECK-LABEL: 'fshr_v8i16_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %fshr = tail call <8 x i16> 
@llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshr ; entry: @@ -180,7 +180,7 @@ define <4 x i32> @fshr_v4i32_3rd_arg_var(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { ; CHECK-LABEL: 'fshr_v4i32_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshr ; entry: @@ -212,7 +212,7 @@ define <2 x i64> @fshr_v2i64_3rd_arg_var(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) { ; CHECK-LABEL: 'fshr_v2i64_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshr ; entry: @@ -224,7 +224,7 @@ define <4 x i30> @fshr_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) { ; CHECK-LABEL: 'fshr_v4i30_3rd_arg_var' -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret 
<4 x i30> %fshr ; entry: @@ -236,7 +236,7 @@ define <2 x i66> @fshr_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) { ; CHECK-LABEL: 'fshr_v2i66_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i66> %fshr ; entry: @@ -259,7 +259,7 @@ define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) { ; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_lanes_different' -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %fshr = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> ) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i128> %fshr ; entry: Index: llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll +++ llvm/test/Analysis/CostModel/AArch64/getIntrinsicInstrCost-vector-reverse.ll @@ -21,8 +21,8 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %12 = call <8 x float> @llvm.experimental.vector.reverse.v8f32(<8 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = call <4 x double> 
@llvm.experimental.vector.reverse.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %15 = call <8 x bfloat> @llvm.experimental.vector.reverse.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %16 = call <16 x bfloat> @llvm.experimental.vector.reverse.v16bf16(<16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; Index: llvm/test/Analysis/CostModel/AArch64/insert-extract.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/insert-extract.ll +++ llvm/test/Analysis/CostModel/AArch64/insert-extract.ll @@ -11,38 +11,38 @@ define void @vectorInstrCost() { ; CHECK-LABEL: 'vectorInstrCost' -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t1 = extractelement <8 x i8> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t2 = extractelement <8 x i8> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t3 = extractelement <4 x i16> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t4 = extractelement <4 x i16> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t5 = extractelement <2 x i32> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %t6 = extractelement <2 x i32> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t7 = extractelement <2 x i64> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t8 = extractelement <2 x i64> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ta0 = extractelement <8 x i1> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ta1 = extractelement <8 x i1> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t1 = extractelement <8 x i8> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t2 = extractelement <8 x i8> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t3 = extractelement <4 x i16> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t4 = extractelement <4 x i16> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t5 = extractelement <2 x i32> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t6 = extractelement <2 x i32> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t7 = extractelement <2 x i64> undef, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t8 = extractelement <2 x i64> undef, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t9 = extractelement <4 x half> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t10 = extractelement <4 x half> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t10 = extractelement <4 x half> undef, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t11 = extractelement <2 x float> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %t12 = extractelement <2 x float> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t12 = extractelement <2 x float> undef, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t13 = extractelement <2 x double> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t14 = extractelement <2 x double> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %t41 = insertelement <8 x i1> undef, i1 true, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t14 = extractelement <2 x double> undef, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t31 = insertelement <8 x i1> undef, i1 false, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t41 = 
insertelement <8 x i1> undef, i1 true, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t30 = insertelement <8 x i8> undef, i8 0, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t40 = insertelement <8 x i8> undef, i8 1, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t50 = insertelement <4 x i16> undef, i16 2, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t60 = insertelement <4 x i16> undef, i16 3, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t70 = insertelement <2 x i32> undef, i32 4, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t80 = insertelement <2 x i32> undef, i32 5, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t90 = insertelement <2 x i64> undef, i64 6, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t100 = insertelement <2 x i64> undef, i64 7, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t110 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t120 = insertelement <4 x half> zeroinitializer, half 0xH0000, i64 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t130 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t140 = insertelement <2 x float> zeroinitializer, float 0.000000e+00, i64 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %t150 = 
insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %t160 = insertelement <2 x double> zeroinitializer, double 0.000000e+00, i64 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; KRYO-LABEL: 'vectorInstrCost' @@ -122,7 +122,7 @@ define <8 x i8> @LD1_B(<8 x i8> %vec, ptr noundef %i) { ; CHECK-LABEL: 'LD1_B' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i8, ptr %i, align 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <8 x i8> %vec, i8 %v1, i32 1 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %v2 ; ; KRYO-LABEL: 'LD1_B' @@ -139,7 +139,7 @@ define <4 x i16> @LD1_H(<4 x i16> %vec, ptr noundef %i) { ; CHECK-LABEL: 'LD1_H' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i16, ptr %i, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <4 x i16> %vec, i16 %v1, i32 2 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %v2 ; ; KRYO-LABEL: 'LD1_H' @@ -156,7 +156,7 @@ define <4 x i32> @LD1_W(<4 x i32> %vec, ptr noundef %i) { ; CHECK-LABEL: 'LD1_W' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i32, ptr %i, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 +; CHECK-NEXT: Cost Model: Found an estimated cost of 
3 for instruction: %v2 = insertelement <4 x i32> %vec, i32 %v1, i32 3 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %v2 ; ; KRYO-LABEL: 'LD1_W' @@ -173,7 +173,7 @@ define <2 x i64> @LD1_X(<2 x i64> %vec, ptr noundef %i) { ; CHECK-LABEL: 'LD1_X' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = load i64, ptr %i, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement <2 x i64> %vec, i64 %v1, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %v2 ; ; KRYO-LABEL: 'LD1_X' Index: llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll +++ llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll @@ -5,24 +5,24 @@ define void @fixed() { ; CHECK-LABEL: 'fixed' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 109 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: 
%v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4i64 = call <4 x i64> 
@llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 212 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x 
i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; entry: Index: llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll +++ llvm/test/Analysis/CostModel/AArch64/mem-op-cost-model.ll @@ -190,11 +190,11 @@ define <4 x i8> @gather_load_4xi8_constant_mask(<4 x ptr> %ptrs) { ; CHECK: gather_load_4xi8_constant_mask ; CHECK-NEON-LABEL: 'gather_load_4xi8_constant_mask' -; CHECK-NEON-NEXT: 
Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_constant_mask' @@ -212,11 +212,11 @@ define <4 x i8> @gather_load_4xi8_variable_mask(<4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: gather_load_4xi8_variable_mask ; CHECK-NEON-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi8_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for 
instruction: %lv = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i8> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i8> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi8_variable_mask' @@ -235,11 +235,11 @@ define void @scatter_store_4xi8_constant_mask(<4 x i8> %val, <4 x ptr> %ptrs) { ; CHECK: scatter_store_4xi8_constant_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi8_constant_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_constant_mask' @@ -257,11 +257,11 @@ define void @scatter_store_4xi8_variable_mask(<4 x i8> %val, <4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi8_variable_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 
1, <4 x i1> %cond) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi8_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi8_variable_mask' @@ -280,11 +280,11 @@ define <4 x i32> @gather_load_4xi32_constant_mask(<4 x ptr> %ptrs) { ; CHECK: gather_load_4xi32_constant_mask ; CHECK-NEON-LABEL: 'gather_load_4xi32_constant_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i32> undef) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i32> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> , <4 x i32> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_constant_mask' @@ -302,11 +302,11 @@ define <4 x i32> @gather_load_4xi32_variable_mask(<4 x ptr> %ptrs, <4 x 
i1> %cond) { ; CHECK: gather_load_4xi32_variable_mask ; CHECK-NEON-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv ; ; CHECK-SVE-128-LABEL: 'gather_load_4xi32_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %lv = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ptrs, i32 1, <4 x i1> %cond, <4 x i32> undef) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lv ; ; CHECK-SVE-256-LABEL: 'gather_load_4xi32_variable_mask' @@ -325,11 +325,11 @@ define void @scatter_store_4xi32_constant_mask(<4 x i32> %val, <4 x ptr> %ptrs) { ; CHECK: scatter_store_4xi32_constant_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi32_constant_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_constant_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x 
i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> ) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_constant_mask' @@ -347,11 +347,11 @@ define void @scatter_store_4xi32_variable_mask(<4 x i32> %val, <4 x ptr> %ptrs, <4 x i1> %cond) { ; CHECK: scatter_store_4xi32_variable_mask ; CHECK-NEON-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'scatter_store_4xi32_variable_mask' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %ptrs, i32 1, <4 x i1> %cond) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'scatter_store_4xi32_variable_mask' @@ -370,11 +370,11 @@ define void @sve_gather_vls(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_scatter_vls' ; CHECK-NEON-LABEL: 'sve_gather_vls' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-NEON-NEXT: Cost 
Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_gather_vls' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1952 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: %res.v256i16 = call <256 x i16> @llvm.masked.gather.v256i16.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x i16> zeroinitializer) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_gather_vls' @@ -394,11 +394,11 @@ define void @sve_gather_vls_float(<256 x i1> %v256i1mask) { ; CHECK-LABEL: 'sve_gather_vls_float' ; CHECK-NEON-LABEL: 'sve_gather_vls_float' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1664 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_gather_vls_float' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1856 for instruction: %res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1664 for instruction: 
%res.v256f32 = call <256 x float> @llvm.masked.gather.v256f32.v256p0(<256 x ptr> undef, i32 0, <256 x i1> %v256i1mask, <256 x float> zeroinitializer) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_gather_vls_float' @@ -418,11 +418,11 @@ define void @sve_scatter_vls(<256 x i1> %v256i1mask){ ; CHECK-LABEL: 'sve_scatter_vls' ; CHECK-NEON-LABEL: 'sve_scatter_vls' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_scatter_vls' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 2000 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 1792 for instruction: call void @llvm.masked.scatter.v256i8.v256p0(<256 x i8> undef, <256 x ptr> undef, i32 0, <256 x i1> %v256i1mask) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_scatter_vls' @@ -442,11 +442,11 @@ define void @sve_scatter_vls_float(<512 x i1> %v512i1mask){ ; CHECK-LABEL: 'sve_scatter_vls_float' ; CHECK-NEON-LABEL: 'sve_scatter_vls_float' -; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> 
undef, i32 0, <512 x i1> %v512i1mask) ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-128-LABEL: 'sve_scatter_vls_float' -; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 3904 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) +; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 3456 for instruction: call void @llvm.masked.scatter.v512f16.v512p0(<512 x half> undef, <512 x ptr> undef, i32 0, <512 x i1> %v512i1mask) ; CHECK-SVE-128-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SVE-256-LABEL: 'sve_scatter_vls_float' Index: llvm/test/Analysis/CostModel/AArch64/min-max.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/min-max.ll +++ llvm/test/Analysis/CostModel/AArch64/min-max.ll @@ -195,10 +195,10 @@ define void @minnum16() { ; CHECK-NOF16-LABEL: 'minnum16' ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minnum.f16(half undef, half undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an 
estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef) ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-F16-LABEL: 'minnum16' @@ -220,10 +220,10 @@ define void @maxnum16() { ; CHECK-NOF16-LABEL: 'maxnum16' ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.maxnum.f16(half undef, half undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 
x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef) ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-F16-LABEL: 'maxnum16' @@ -288,10 +288,10 @@ define void @minimum16() { ; CHECK-NOF16-LABEL: 'minimum16' ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half @llvm.minimum.f16(half undef, half undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.minimum.v2f16(<2 x half> undef, <2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.minimum.v4f16(<4 x half> undef, <4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.minimum.v8f16(<8 x half> undef, <8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.minimum.v16f16(<16 x half> undef, <16 x half> undef) ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-F16-LABEL: 'minimum16' @@ -313,10 +313,10 @@ define void @maximum16() { ; CHECK-NOF16-LABEL: 'maximum16' ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call half 
@llvm.maximum.f16(half undef, half undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2f16 = call <2 x half> @llvm.maximum.v2f16(<2 x half> undef, <2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4f16 = call <4 x half> @llvm.maximum.v4f16(<4 x half> undef, <4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8f16 = call <8 x half> @llvm.maximum.v8f16(<8 x half> undef, <8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V16f16 = call <16 x half> @llvm.maximum.v16f16(<16 x half> undef, <16 x half> undef) ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-F16-LABEL: 'maximum16' Index: llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll +++ llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll @@ -7,35 +7,35 @@ define void @strict_fp_reductions() { ; CHECK-LABEL: 'strict_fp_reductions' -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated 
cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an 
estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; FP16-LABEL: 'strict_fp_reductions' -; FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double 
@llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; BF16-LABEL: 'strict_fp_reductions' -; BF16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %fadd_v4f16 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x 
half> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v8f16 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fadd_v4f8 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -58,20 +58,20 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) -; CHECK-NEXT: Cost Model: Found 
an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for 
instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -81,20 +81,20 @@ ; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) ; FP16-NEXT: Cost Model: Found an estimated cost 
of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: 
%fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -104,20 +104,20 @@ ; BF16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) ; 
BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: 
%fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64 = 
call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void Index: llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll +++ llvm/test/Analysis/CostModel/AArch64/reduce-minmax.ll @@ -162,14 +162,14 @@ define void @reduce_fmin16() { ; CHECK-NOF16-LABEL: 'reduce_fmin16' -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 350 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) -; 
CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2f16m = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4f16m = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V8f16m = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 350 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmin.v8f16(<8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmin.v16f16(<16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half @llvm.vector.reduce.fminimum.v2f16(<2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fminimum.v4f16(<4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fminimum.v8f16(<8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fminimum.v16f16(<16 x half> undef) ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-F16-LABEL: 'reduce_fmin16' @@ -196,14 +196,14 @@ define void 
@reduce_fmax16() { ; CHECK-NOF16-LABEL: 'reduce_fmax16' -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 350 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2f16m = call half @llvm.vector.reduce.fmaximum.v2f16(<2 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V4f16m = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 237 for instruction: %V8f16m = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> undef) -; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 350 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16 = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16 = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16 = call half @llvm.vector.reduce.fmax.v8f16(<8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16 = call half @llvm.vector.reduce.fmax.v16f16(<16 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2f16m = call half 
@llvm.vector.reduce.fmaximum.v2f16(<2 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V4f16m = call half @llvm.vector.reduce.fmaximum.v4f16(<4 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V8f16m = call half @llvm.vector.reduce.fmaximum.v8f16(<8 x half> undef) +; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 252 for instruction: %V16f16m = call half @llvm.vector.reduce.fmaximum.v16f16(<16 x half> undef) ; CHECK-NOF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-F16-LABEL: 'reduce_fmax16' Index: llvm/test/Analysis/CostModel/AArch64/rem.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/rem.ll +++ llvm/test/Analysis/CostModel/AArch64/rem.ll @@ -6,21 +6,21 @@ define i32 @srem() { ; CHECK-LABEL: 'srem' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = srem i64 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> 
undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> 
undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, undef @@ -49,21 +49,21 @@ define i32 @urem() { ; CHECK-LABEL: 'urem' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = urem i64 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 
132 for instruction: %V16i16 = urem <16 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, undef @@ -92,21 +92,21 @@ define i32 @srem_const() { ; CHECK-LABEL: 'srem_const' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, 7 @@ -135,21 +135,21 @@ define i32 @urem_const() { ; CHECK-LABEL: 'urem_const' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, 7 @@ -178,21 +178,21 @@ define i32 @srem_uniformconst() { ; CHECK-LABEL: 'srem_uniformconst' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = srem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost 
of 48 for instruction: %V4i64 = srem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = srem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 
3 for instruction: %I8 = srem i8 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, 7 @@ -221,21 +221,21 @@ define i32 @urem_uniformconst() { ; CHECK-LABEL: 'urem_uniformconst' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 
= urem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, 7 @@ -264,21 +264,21 @@ define i32 @srem_constpow2() { ; CHECK-LABEL: 
'srem_constpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> 
undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, 16 @@ -307,21 +307,21 @@ define i32 @urem_constpow2() { ; CHECK-LABEL: 'urem_constpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = urem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16 -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, 16 @@ -350,21 +350,21 @@ define i32 @srem_uniformconstpow2() { ; CHECK-LABEL: 'srem_uniformconstpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I64 = srem i64 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2i64 = srem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4i64 = srem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8i64 = srem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2i64 = srem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4i64 = srem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i64 = srem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = srem i32 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4i32 = srem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8i32 = srem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16i32 = srem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4i32 = srem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i32 = srem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i32 = srem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = srem i16 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V8i16 = srem <8 x i16> undef, -; CHECK-NEXT: Cost Model: 
Found an estimated cost of 196 for instruction: %V16i16 = srem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 392 for instruction: %V32i16 = srem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8i16 = srem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i16 = srem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i16 = srem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = srem i8 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %V16i8 = srem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 404 for instruction: %V32i8 = srem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %V64i8 = srem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16i8 = srem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32i8 = srem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 704 for instruction: %V64i8 = srem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, 16 @@ -393,21 +393,21 @@ define i32 @urem_uniformconstpow2() { ; CHECK-LABEL: 'urem_uniformconstpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found 
an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, 16 @@ -436,21 +436,21 @@ define i32 @srem_constnegpow2() { ; CHECK-LABEL: 'srem_constnegpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = srem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i64 = srem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = srem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = srem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = srem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = srem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = srem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, -16 @@ -479,21 +479,21 @@ define i32 @urem_constnegpow2() { ; CHECK-LABEL: 'urem_constnegpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2i64 = urem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 
24 for instruction: %V4i64 = urem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i64 = urem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2i64 = urem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i64 = urem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i64 = urem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 
3 for instruction: %I8 = urem i8 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, -16 @@ -522,21 +522,21 @@ define i32 @srem_uniformconstnegpow2() { ; CHECK-LABEL: 'srem_uniformconstnegpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = srem i64 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = srem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = srem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = srem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = srem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = srem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = srem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = srem i32 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = srem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = srem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 
for instruction: %V16i32 = srem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = srem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = srem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = srem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = srem i16 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = srem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = srem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = srem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i16 = srem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = srem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = srem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = srem i8 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = srem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = srem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = srem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = srem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = srem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = srem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = srem i64 undef, -16 @@ -565,21 +565,21 @@ define i32 
@urem_uniformconstnegpow2() { ; CHECK-LABEL: 'urem_uniformconstnegpow2' ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %I64 = urem i64 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2i64 = urem <2 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V4i64 = urem <4 x i64> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V8i64 = urem <8 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V2i64 = urem <2 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V4i64 = urem <4 x i64> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %V8i64 = urem <8 x i64> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = urem i32 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4i32 = urem <4 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8i32 = urem <8 x i32> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V16i32 = urem <16 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4i32 = urem <4 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8i32 = urem <8 x i32> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i32 = urem <16 x i32> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = urem i16 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V8i16 = urem <8 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V16i16 = urem <16 x i16> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V32i16 = urem <32 x i16> undef, +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 56 for instruction: %V8i16 = urem <8 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i16 = urem <16 x i16> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i16 = urem <32 x i16> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = urem i8 undef, -16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16i8 = urem <16 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 276 for instruction: %V32i8 = urem <32 x i8> undef, -; CHECK-NEXT: Cost Model: Found an estimated cost of 552 for instruction: %V64i8 = urem <64 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V16i8 = urem <16 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V32i8 = urem <32 x i8> undef, +; CHECK-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V64i8 = urem <64 x i8> undef, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %I64 = urem i64 undef, -16 Index: llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll +++ llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll @@ -265,13 +265,13 @@ define <8 x i8> @ld1r_8b_int_shuff(ptr nocapture %x) { ; CHECK-LABEL: 'ld1r_8b_int_shuff' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %lane ; ; CODESIZE-LABEL: 'ld1r_8b_int_shuff' ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i8> undef, i8 %tmp, i8 0 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %lane ; @@ -285,13 +285,13 @@ define <16 x i8> @ld1r_16b_int_shuff(ptr nocapture %x) { ; CHECK-LABEL: 'ld1r_16b_int_shuff' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %lane ; ; CODESIZE-LABEL: 'ld1r_16b_int_shuff' ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i8, ptr %x, align 2 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <16 x i8> undef, i8 %tmp, i8 0 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <16 x i8> %tmp1, <16 x i8> undef, 
<16 x i32> zeroinitializer ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %lane ; @@ -305,13 +305,13 @@ define <4 x i16> @ld1r_4h_int_shuff(ptr nocapture %x) { ; CHECK-LABEL: 'ld1r_4h_int_shuff' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %lane ; ; CODESIZE-LABEL: 'ld1r_4h_int_shuff' ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i16> undef, i16 %tmp, i16 0 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %lane ; @@ -325,13 +325,13 @@ define <8 x i16> @ld1r_8h_int_shuff(ptr nocapture %x) { ; CHECK-LABEL: 'ld1r_8h_int_shuff' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0 ; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %lane ; ; CODESIZE-LABEL: 'ld1r_8h_int_shuff' ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i16, ptr %x, align 2 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <8 x i16> undef, i16 %tmp, i16 0 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %lane ; @@ -345,13 +345,13 @@ define <2 x i32> @ld1r_2s_int_shuff(ptr nocapture %x) { ; CHECK-LABEL: 'ld1r_2s_int_shuff' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i32> %lane ; ; CODESIZE-LABEL: 'ld1r_2s_int_shuff' ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i32> undef, i32 %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x 
i32> undef, i32 %tmp, i32 0 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i32> %lane ; @@ -365,13 +365,13 @@ define <4 x i32> @ld1r_4s_int_shuff(ptr nocapture %x) { ; CHECK-LABEL: 'ld1r_4s_int_shuff' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %lane ; ; CODESIZE-LABEL: 'ld1r_4s_int_shuff' ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i32, ptr %x, align 4 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <4 x i32> undef, i32 %tmp, i32 0 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %lane ; @@ -385,13 +385,13 @@ define <2 x i64> @ld1r_2d_int_shuff(ptr nocapture %x) { ; CHECK-LABEL: 'ld1r_2d_int_shuff' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 
%tmp, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %lane ; ; CODESIZE-LABEL: 'ld1r_2d_int_shuff' ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp = load i64, ptr %x, align 8 -; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp1 = insertelement <2 x i64> undef, i64 %tmp, i32 0 ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer ; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %lane ; Index: llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll +++ llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll @@ -87,42 +87,42 @@ define void @insert_subvec() { ; CHECK-LABEL: 'insert_subvec' -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_2_1 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v8i8_2_0 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 
for instruction: %v8i8_2_0 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost 
of 1 for instruction: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v16i32_4_05 = shufflevector <16 x i32> 
undef, <16 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> @@ -169,16 +169,16 @@ define void @multipart() { ; CHECK-LABEL: 'multipart' -; CHECK-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v16a = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16b = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v16c = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v16d = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32a = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a4 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> Index: llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll +++ llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" ; COST-LABEL: sel.v8i8 -; COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; COST: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CODE-LABEL: sel.v8i8 ; CODE: tbl v0.8b, { v0.16b }, v1.8b define <8 x i8> @sel.v8i8(<8 x i8> %v0, <8 x i8> %v1) { @@ -13,7 +13,7 @@ } ; COST-LABEL: sel.v16i8 -; COST: Found an estimated cost of 90 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; COST: Found an estimated cost of 60 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CODE-LABEL: sel.v16i8 ; CODE: tbl v0.16b, { v0.16b, v1.16b }, v2.16b define <16 x i8> @sel.v16i8(<16 x i8> %v0, <16 x i8> %v1) { @@ -32,7 +32,7 @@ } ; COST-LABEL: sel.v8i16 -; 
COST: Found an estimated cost of 42 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; COST: Found an estimated cost of 28 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CODE-LABEL: sel.v8i16 ; CODE: tbl v0.16b, { v0.16b, v1.16b }, v2.16b define <8 x i16> @sel.v8i16(<8 x i16> %v0, <8 x i16> %v1) { Index: llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll +++ llvm/test/Analysis/CostModel/AArch64/sve-insert-extract.ll @@ -9,11 +9,11 @@ define void @ins_el0() #0 { ; CHECK-DEFAULT-LABEL: 'ins_el0' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = insertelement 
zeroinitializer, i64 0, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -50,13 +50,13 @@ define void @ins_el1() #0 { ; CHECK-DEFAULT-LABEL: 'ins_el1' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = insertelement zeroinitializer, i1 false, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = insertelement zeroinitializer, i8 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = insertelement zeroinitializer, i16 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement zeroinitializer, i32 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %v3 = insertelement zeroinitializer, i64 0, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = insertelement zeroinitializer, float 0.000000e+00, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = insertelement zeroinitializer, double 0.000000e+00, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LOW-LABEL: 'ins_el1' @@ -92,11 +92,11 @@ define void @ext_el0() #0 { ; CHECK-DEFAULT-LABEL: 'ext_el0' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vi1 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement zeroinitializer, i64 0 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement zeroinitializer, i64 0 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement zeroinitializer, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4 = extractelement zeroinitializer, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: %v5 = extractelement zeroinitializer, i64 0 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -133,13 +133,13 @@ define void @ext_el1() #0 { ; CHECK-DEFAULT-LABEL: 'ext_el1' -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %vi1 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v0 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v1 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v3 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4 = extractelement zeroinitializer, i64 1 -; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %vi1 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v0 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = extractelement zeroinitializer, i64 1 +; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v5 = extractelement 
zeroinitializer, i64 1 ; CHECK-DEFAULT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-LOW-LABEL: 'ext_el1' Index: llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll +++ llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll @@ -6,8 +6,8 @@ define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) { ; CHECK-LABEL: 'vector_insert_extract' -; CHECK-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0) -; CHECK-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -618,7 +618,7 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 
1 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -633,16 +633,16 @@ ; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv2i1_i32 = call <vscale x 2 x i1> @llvm.get.active.lane.mask.nxv2i1.i32(i32 undef, i32 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Invalid cost for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %mask_v8i1_i64 = call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; TYPE_BASED_ONLY-NEXT: Cost 
Model: Found an estimated cost of 96 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef) @@ -746,11 +746,11 @@ define <4 x i32> @masked_gather_v4i32(<4 x ptr> %ld, <4 x i1> %masks, <4 x i32> %passthru) { ; CHECK-LABEL: 'masked_gather_v4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v4i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res ; %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) @@ -759,7 +759,7 @@ define <1 x i128> @masked_gather_v1i128(<1 x ptr> %ld, <1 x i1> %masks, <1 x i128> %passthru) { ; CHECK-LABEL: 'masked_gather_v1i128' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: 
%res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <1 x i128> @llvm.masked.gather.v1i128.v1p0(<1 x ptr> %ld, i32 0, <1 x i1> %masks, <1 x i128> %passthru) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <1 x i128> %res ; ; TYPE_BASED_ONLY-LABEL: 'masked_gather_v1i128' @@ -800,11 +800,11 @@ define void @masked_scatter_v4i32(<4 x i32> %data, <4 x ptr> %ptrs, <4 x i1> %masks) { ; CHECK-LABEL: 'masked_scatter_v4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v4i32' -; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 31 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) +; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 28 for instruction: call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %data, <4 x ptr> %ptrs, i32 0, <4 x i1> %masks) ; TYPE_BASED_ONLY-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -814,7 +814,7 @@ define void @masked_scatter_v1i128(<1 x i128> %data, <1 x ptr> %ptrs, <1 x i1> %masks) { ; CHECK-LABEL: 'masked_scatter_v1i128' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x ptr> %ptrs, i32 0, <1 x i1> %masks) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v1i128.v1p0(<1 x i128> %data, <1 x 
ptr> %ptrs, i32 0, <1 x i1> %masks) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; TYPE_BASED_ONLY-LABEL: 'masked_scatter_v1i128' Index: llvm/test/Analysis/CostModel/AArch64/vector-select.ll =================================================================== --- llvm/test/Analysis/CostModel/AArch64/vector-select.ll +++ llvm/test/Analysis/CostModel/AArch64/vector-select.ll @@ -140,7 +140,7 @@ } ; COST-LABEL: v2i64_select_no_cmp -; COST-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b +; COST-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %s.1 = select <2 x i1> %cond, <2 x i64> %a, <2 x i64> %b ; CODE-LABEL: v2i64_select_no_cmp ; CODE: bb.0 @@ -175,7 +175,7 @@ define <8 x half> @v8f16_select_ogt(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-LABEL: v8f16_select_ogt -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %cmp.1 = fcmp ogt <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp ogt <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ogt <8 x half> %a, %b ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -259,7 +259,7 @@ define <8 x half> @v8f16_select_oge(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-LABEL: v8f16_select_oge -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %cmp.1 = fcmp oge <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp oge <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x 
i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oge <8 x half> %a, %b ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -343,7 +343,7 @@ define <8 x half> @v8f16_select_olt(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-LABEL: v8f16_select_olt -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %cmp.1 = fcmp olt <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp olt <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp olt <8 x half> %a, %b ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -427,7 +427,7 @@ define <8 x half> @v8f16_select_ole(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-LABEL: v8f16_select_ole -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %cmp.1 = fcmp ole <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp ole <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp ole <8 x half> %a, %b ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -511,7 +511,7 @@ define <8 x half> @v8f16_select_oeq(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-LABEL: v8f16_select_oeq -; COST-NOFP16-NEXT: Cost Model: Found an 
estimated cost of 29 for instruction: %cmp.1 = fcmp oeq <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp oeq <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp oeq <8 x half> %a, %b ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -597,7 +597,7 @@ define <8 x half> @v8f16_select_one(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-LABEL: v8f16_select_one -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %cmp.1 = fcmp one <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp one <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp one <8 x half> %a, %b ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c @@ -689,7 +689,7 @@ define <8 x half> @v8f16_select_une(<8 x half> %a, <8 x half> %b, <8 x half> %c) { ; COST-LABEL: v8f16_select_une -; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %cmp.1 = fcmp une <8 x half> %a, %b +; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %cmp.1 = fcmp une <8 x half> %a, %b ; COST-NOFP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c ; COST-FULLFP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cmp.1 = fcmp une <8 x half> %a, %b ; COST-FULLFP16-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %s.1 = select <8 x i1> %cmp.1, <8 x half> %a, <8 x half> %c Index: llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll @@ -13,7 +13,7 @@ ; %var4 a lower scalarization overhead. ; ; COST-LABEL: predicated_udiv_scalarized_operand -; COST: LV: Found an estimated cost of 4 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3 +; COST: LV: Found an estimated cost of 5 for VF 2 For instruction: %var4 = udiv i64 %var2, %var3 ; ; define i64 @predicated_udiv_scalarized_operand(ptr %a, i64 %x) optsize { Index: llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll @@ -7,7 +7,7 @@ %pair = type { i8, i8 } ; CHECK-LABEL: test -; CHECK: Found an estimated cost of 14 for VF 2 For instruction: {{.*}} load i8 +; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8 ; CHECK-LABEL: entry: ; CHECK-LABEL: vector.body: Index: llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll @@ -168,10 +168,10 @@ ; gaps. 
; ; VF_2-LABEL: Checking a loop in 'i64_factor_8' -; VF_2: Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 +; VF_2: Found an estimated cost of 16 for VF 2 For instruction: %tmp2 = load i64, ptr %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, ptr %tmp1, align 8 -; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8 -; VF_2-NEXT: Found an estimated cost of 7 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp0, align 8 +; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i64 0, ptr %tmp1, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %i64.8, ptr %data, i64 %i, i32 2 Index: llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll @@ -5,8 +5,8 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-COST: Checking a loop in 'fixed_width' -; CHECK-COST: Found an estimated cost of 11 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4 -; CHECK-COST: Found an estimated cost of 25 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4 +; CHECK-COST: Found an estimated cost of 12 for VF 2 For instruction: store i32 2, ptr %arrayidx1, align 4 +; CHECK-COST: Found an estimated cost of 24 for VF 4 For instruction: store i32 2, ptr %arrayidx1, align 4 ; CHECK-COST: Selecting VF: 1. 
; We should decide this loop is not worth vectorising using fixed width vectors Index: llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/predication_costs.ll @@ -16,10 +16,10 @@ ; as: ; ; Cost of udiv: -; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7 ; ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv(ptr %a, ptr %b, i1 %c, i64 %n) { entry: @@ -57,10 +57,10 @@ ; as: ; ; Cost of store: -; (store(4) + extractelement(3)) / 2 = 3 +; (store(4) + extractelement(4)) / 2 = 4 ; ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4 -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 ; define void @predicated_store(ptr %a, i1 %c, i32 %x, i64 %n) { entry: @@ -94,7 +94,7 @@ ; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, ptr %addr, i64 1 ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %addr, align 4 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi ptr [ %a, %entry ], [ %addr.next, %for.inc ] -; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4 +; CHECK: Found an estimated cost of 4 for VF 2 For instruction: store i32 %tmp2, ptr %addr, align 4 ; define void @predicated_store_phi(ptr %a, i1 %c, i32 %x, i64 %n) { entry: @@ -129,14 +129,14 @@ ; compute the cost as: ; ; Cost of add: -; (add(2) + 
extractelement(3)) / 2 = 2 +; (add(2) + extractelement(4)) / 2 = 3 ; Cost of udiv: -; (udiv(2) + extractelement(3) + insertelement(3)) / 2 = 4 +; (udiv(2) + extractelement(4) + insertelement(4)) / 2 = 5 ; ; CHECK: Scalarizing: %tmp3 = add nsw i32 %tmp2, %x ; CHECK: Scalarizing and predicating: %tmp4 = udiv i32 %tmp2, %tmp3 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x -; CHECK: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 +; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp3 = add nsw i32 %tmp2, %x +; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp2, %tmp3 ; define i32 @predicated_udiv_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { entry: @@ -174,13 +174,13 @@ ; compute the cost as: ; ; Cost of add: -; (add(2) + extractelement(3)) / 2 = 2 +; (add(2) + extractelement(4)) / 2 = 3 ; Cost of store: ; store(4) / 2 = 2 ; ; CHECK: Scalarizing: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Scalarizing and predicating: store i32 %tmp2, ptr %tmp0, align 4 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x +; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp2 = add nsw i32 %tmp1, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp2, ptr %tmp0, align 4 ; define void @predicated_store_scalarized_operand(ptr %a, i1 %c, i32 %x, i64 %n) { @@ -219,11 +219,11 @@ ; Cost of add: ; add(1) = 1 ; Cost of sdiv: -; (sdiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; (sdiv(2) + extractelement(8) + insertelement(4)) / 2 = 7 ; Cost of udiv: -; (udiv(2) + extractelement(6) + insertelement(3)) / 2 = 5 +; (udiv(2) + extractelement(8) + insertelement(4)) / 2 = 7 ; Cost of sub: -; (sub(2) + extractelement(3)) / 2 = 2 +; (sub(2) + extractelement(4)) / 2 = 3 ; Cost of store: ; store(4) / 2 = 2 ; @@ -233,9 +233,9 @@ ; CHECK: Scalarizing: %tmp5 = sub i32 %tmp4, %x ; 
CHECK: Scalarizing and predicating: store i32 %tmp5, ptr %tmp0, align 4 ; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %tmp2 = add i32 %tmp1, %x -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 -; CHECK: Found an estimated cost of 5 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 -; CHECK: Found an estimated cost of 2 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x +; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp3 = sdiv i32 %tmp1, %tmp2 +; CHECK: Found an estimated cost of 7 for VF 2 For instruction: %tmp4 = udiv i32 %tmp3, %tmp2 +; CHECK: Found an estimated cost of 3 for VF 2 For instruction: %tmp5 = sub i32 %tmp4, %x ; CHECK: Found an estimated cost of 2 for VF 2 For instruction: store i32 %tmp5, ptr %tmp0, align 4 ; define void @predication_multi_context(ptr %a, i1 %c, i32 %x, i64 %n) { Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll @@ -6,8 +6,8 @@ target triple="aarch64-unknown-linux-gnu" -; CHECK-VF4: Found an estimated cost of 17 for VF 4 For instruction: %add = fadd float %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 34 for VF 8 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd float %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd float %0, %sum.07 define float @fadd_strict32(ptr noalias nocapture readonly %a, i64 %n) { entry: @@ -28,8 +28,8 @@ } -; CHECK-VF4: Found an estimated cost of 14 for VF 4 For instruction: %add = fadd double %0, %sum.07 -; CHECK-VF8: Found an estimated cost of 28 for VF 8 For instruction: %add = fadd double %0, %sum.07 +; CHECK-VF4: Found an estimated cost of 12 for VF 4 For instruction: 
%add = fadd double %0, %sum.07 +; CHECK-VF8: Found an estimated cost of 24 for VF 8 For instruction: %add = fadd double %0, %sum.07 define double @fadd_strict64(ptr noalias nocapture readonly %a, i64 %n) { entry: @@ -49,8 +49,8 @@ ret double %add } -; CHECK-VF4: Found an estimated cost of 19 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) -; CHECK-VF8: Found an estimated cost of 38 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) +; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) +; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07) define float @fmuladd_strict32(ptr %a, ptr %b, i64 %n) { entry: @@ -74,8 +74,8 @@ declare float @llvm.fmuladd.f32(float, float, float) -; CHECK-VF4: Found an estimated cost of 18 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07) -; CHECK-VF8: Found an estimated cost of 36 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07) +; CHECK-VF4: Found an estimated cost of 16 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07) +; CHECK-VF8: Found an estimated cost of 32 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07) define double @fmuladd_strict64(ptr %a, ptr %b, i64 %n) { entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll @@ -1,14 +1,13 @@ ; REQUIRES: asserts ; RUN: opt 
-passes=loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s -; Specify a large unsafe vectorization factor of 32 that gets clamped to 16, -; then test an even smaller VF of 2 is selected based on the cost-model. +; Specify a large unsafe vectorization factor of 32 that gets clamped to 16. ; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16. ; CHECK: remark: :0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16 -; CHECK: LV: Selecting VF: 2. +; CHECK: LV: Selecting VF: 16. ; CHECK-LABEL: @test -; CHECK: <2 x i64> +; CHECK: <16 x i64> define void @test(ptr nocapture %a, ptr nocapture readonly %b) { entry: br label %loop.header Index: llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll =================================================================== --- llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll +++ llvm/test/Transforms/LowerMatrixIntrinsics/dot-product-float.ll @@ -5,46 +5,10 @@ define <1 x float> @dotproduct_float_v6(<6 x float> %a, <6 x float> %b) { ; CHECK-LABEL: @dotproduct_float_v6( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x float> [[A:%.*]], <6 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> -; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> -; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> -; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> -; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <6 x float> [[A]], <6 x float> poison, <1 x i32> -; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <6 x float> [[B:%.*]], <6 x float> poison, <6 x i32> -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x float> [[SPLIT]], <1 x float> poison, <1 x i32> 
zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x float> poison, float [[TMP0]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <1 x float> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK7:%.*]] = shufflevector <1 x float> [[SPLIT1]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT8:%.*]] = insertelement <1 x float> poison, float [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT9:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT8]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK7]], <1 x float> [[SPLAT_SPLAT9]], <1 x float> [[TMP1]]) -; CHECK-NEXT: [[BLOCK10:%.*]] = shufflevector <1 x float> [[SPLIT2]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 2 -; CHECK-NEXT: [[SPLAT_SPLATINSERT11:%.*]] = insertelement <1 x float> poison, float [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT12:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT11]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK10]], <1 x float> [[SPLAT_SPLAT12]], <1 x float> [[TMP3]]) -; CHECK-NEXT: [[BLOCK13:%.*]] = shufflevector <1 x float> [[SPLIT3]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 3 -; CHECK-NEXT: [[SPLAT_SPLATINSERT14:%.*]] = insertelement <1 x float> poison, float [[TMP6]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT15:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT14]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: 
[[TMP7:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK13]], <1 x float> [[SPLAT_SPLAT15]], <1 x float> [[TMP5]]) -; CHECK-NEXT: [[BLOCK16:%.*]] = shufflevector <1 x float> [[SPLIT4]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 4 -; CHECK-NEXT: [[SPLAT_SPLATINSERT17:%.*]] = insertelement <1 x float> poison, float [[TMP8]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT18:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT17]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK16]], <1 x float> [[SPLAT_SPLAT18]], <1 x float> [[TMP7]]) -; CHECK-NEXT: [[BLOCK19:%.*]] = shufflevector <1 x float> [[SPLIT5]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x float> [[SPLIT6]], i64 5 -; CHECK-NEXT: [[SPLAT_SPLATINSERT20:%.*]] = insertelement <1 x float> poison, float [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT21:%.*]] = shufflevector <1 x float> [[SPLAT_SPLATINSERT20]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call fast <1 x float> @llvm.fmuladd.v1f32(<1 x float> [[BLOCK19]], <1 x float> [[SPLAT_SPLAT21]], <1 x float> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x float> [[TMP11]], <1 x float> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <1 x float> poison, <1 x float> [[TMP12]], <1 x i32> -; CHECK-NEXT: ret <1 x float> [[TMP13]] +; CHECK-NEXT: [[TMP0:%.*]] = fmul <6 x float> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v6f32(float 0.000000e+00, <6 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <1 x float> poison, float [[TMP1]], i64 0 +; CHECK-NEXT: ret <1 x float> [[TMP2]] ; entry: %c = tail call fast <1 x float> @llvm.matrix.multiply.v1f32.v6f32.v6f32(<6 x float> %a, <6 x float> %b, i32 1, i32 6, i32 1) @@ 
-175,51 +139,12 @@ define <1 x double> @intrinsic_column_major_load_dot_product_double_v6(ptr %lhs_address, ptr %rhs_address) { ; CHECK-LABEL: @intrinsic_column_major_load_dot_product_double_v6( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, ptr [[LHS_ADDRESS:%.*]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 1 -; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[VEC_GEP]], align 4 -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 2 -; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[VEC_GEP2]], align 4 -; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 3 -; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <1 x double>, ptr [[VEC_GEP4]], align 4 -; CHECK-NEXT: [[VEC_GEP6:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 4 -; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <1 x double>, ptr [[VEC_GEP6]], align 4 -; CHECK-NEXT: [[VEC_GEP8:%.*]] = getelementptr double, ptr [[LHS_ADDRESS]], i64 5 -; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[VEC_GEP8]], align 4 -; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4 -; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <1 x double> [[COL_LOAD]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> poison, double [[TMP0]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = fmul fast <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] -; CHECK-NEXT: [[BLOCK11:%.*]] = shufflevector <1 x double> [[COL_LOAD1]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 1 -; CHECK-NEXT: [[SPLAT_SPLATINSERT12:%.*]] = insertelement <1 x double> poison, 
double [[TMP2]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT12]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK11]], <1 x double> [[SPLAT_SPLAT13]], <1 x double> [[TMP1]]) -; CHECK-NEXT: [[BLOCK14:%.*]] = shufflevector <1 x double> [[COL_LOAD3]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 2 -; CHECK-NEXT: [[SPLAT_SPLATINSERT15:%.*]] = insertelement <1 x double> poison, double [[TMP4]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT15]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK14]], <1 x double> [[SPLAT_SPLAT16]], <1 x double> [[TMP3]]) -; CHECK-NEXT: [[BLOCK17:%.*]] = shufflevector <1 x double> [[COL_LOAD5]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 3 -; CHECK-NEXT: [[SPLAT_SPLATINSERT18:%.*]] = insertelement <1 x double> poison, double [[TMP6]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT18]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK17]], <1 x double> [[SPLAT_SPLAT19]], <1 x double> [[TMP5]]) -; CHECK-NEXT: [[BLOCK20:%.*]] = shufflevector <1 x double> [[COL_LOAD7]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 4 -; CHECK-NEXT: [[SPLAT_SPLATINSERT21:%.*]] = insertelement <1 x double> poison, double [[TMP8]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT21]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = call fast <1 x 
double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK20]], <1 x double> [[SPLAT_SPLAT22]], <1 x double> [[TMP7]]) -; CHECK-NEXT: [[BLOCK23:%.*]] = shufflevector <1 x double> [[COL_LOAD9]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <6 x double> [[COL_LOAD10]], i64 5 -; CHECK-NEXT: [[SPLAT_SPLATINSERT24:%.*]] = insertelement <1 x double> poison, double [[TMP10]], i64 0 -; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT24]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = call fast <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[BLOCK23]], <1 x double> [[SPLAT_SPLAT25]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <1 x double> [[TMP11]], <1 x double> poison, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <1 x double> poison, <1 x double> [[TMP12]], <1 x i32> -; CHECK-NEXT: ret <1 x double> [[TMP13]] +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <6 x double>, ptr [[RHS_ADDRESS:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <6 x double>, ptr [[LHS_ADDRESS:%.*]], align 64 +; CHECK-NEXT: [[TMP1:%.*]] = fmul <6 x double> [[TMP0]], [[COL_LOAD]] +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fadd.v6f64(double 0.000000e+00, <6 x double> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <1 x double> poison, double [[TMP2]], i64 0 +; CHECK-NEXT: ret <1 x double> [[TMP3]] ; entry: %lhs = tail call fast <6 x double> @llvm.matrix.column.major.load.v6f64.i64(ptr nonnull align 4 %lhs_address, i64 1, i1 false, i32 1, i32 6) Index: llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -12,18 +12,21 @@ ; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> ; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to 
<4 x i32> ; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]] -; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 -; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0 +; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sext <2 x i32> [[TMP0]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP3]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]] +; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 +; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[S3]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4 ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void @@ -58,23 +61,25 @@ ; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32> ; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32> ; CHECK-NEXT: [[SUB0:%.*]] = sub <4 
x i32> [[Z0]], [[Z1]] -; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64> -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[C0:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0 -; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP6]] +; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0 +; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64 +; CHECK-NEXT: [[A0:%.*]] = add i64 [[S0]], [[C0:%.*]] +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[A0]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 1 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP7]] +; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1 +; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64 +; CHECK-NEXT: [[A1:%.*]] = add i64 [[S1]], [[C1:%.*]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A1]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP8]] +; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 +; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 +; CHECK-NEXT: [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A2]] ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i32 3 -; CHECK-NEXT: 
[[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]] +; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 +; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 +; CHECK-NEXT: [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]] +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[A3]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, ptr [[GEP3]], align 4 ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void Index: llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -10,7 +10,7 @@ ; REMARK-LABEL: Function: gather_multiple_use ; REMARK: Args: ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' -; REMARK-NEXT: - Cost: '-7' +; REMARK-NEXT: - Cost: '-8' ; ; REMARK-NOT: Function: gather_load Index: llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -30,7 +30,7 @@ ; YAML-NEXT: Function: getelementptr_4x32 ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '6' +; YAML-NEXT: - Cost: '4' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' Index: llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/landing_pad.ll @@ -8,7 +8,7 @@ ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '3' +; YAML-NEXT: - Cost: '2' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: 
- TreeSize: '2' @@ -28,7 +28,7 @@ ; YAML-NEXT: Function: foo ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'SLP vectorized with cost ' -; YAML-NEXT: - Cost: '1' +; YAML-NEXT: - Cost: '2' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '9' Index: llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/matmul.ll @@ -11,66 +11,46 @@ define void @wrap_mul4(ptr nocapture %Out, ptr nocapture readonly %A, ptr nocapture readonly %B) { ; CHECK-LABEL: @wrap_mul4( ; CHECK-NEXT: [[TEMP:%.*]] = load double, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TEMP1:%.*]] = load double, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[MUL_I:%.*]] = fmul double [[TEMP]], [[TEMP1]] ; CHECK-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 0, i64 1 ; CHECK-NEXT: [[TEMP2:%.*]] = load double, ptr [[ARRAYIDX5_I]], align 8 -; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 0 -; CHECK-NEXT: [[TEMP3:%.*]] = load double, ptr [[ARRAYIDX7_I]], align 8 -; CHECK-NEXT: [[MUL8_I:%.*]] = fmul double [[TEMP2]], [[TEMP3]] -; CHECK-NEXT: [[ADD_I:%.*]] = fadd double [[MUL_I]], [[MUL8_I]] -; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 1 -; CHECK-NEXT: [[TEMP4:%.*]] = load double, ptr [[ARRAYIDX13_I]], align 8 -; CHECK-NEXT: [[MUL14_I:%.*]] = fmul double [[TEMP]], [[TEMP4]] -; CHECK-NEXT: [[ARRAYIDX18_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 1 -; CHECK-NEXT: [[TEMP5:%.*]] = load double, ptr [[ARRAYIDX18_I]], align 8 -; CHECK-NEXT: [[MUL19_I:%.*]] = fmul double [[TEMP2]], [[TEMP5]] -; CHECK-NEXT: [[ADD20_I:%.*]] = fadd double [[MUL14_I]], [[MUL19_I]] +; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B:%.*]], i64 1, i64 0 ; CHECK-NEXT: [[ARRAYIDX25_I:%.*]] = 
getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 2 -; CHECK-NEXT: [[TEMP6:%.*]] = load double, ptr [[ARRAYIDX25_I]], align 8 -; CHECK-NEXT: [[MUL26_I:%.*]] = fmul double [[TEMP]], [[TEMP6]] ; CHECK-NEXT: [[ARRAYIDX30_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 2 -; CHECK-NEXT: [[TEMP7:%.*]] = load double, ptr [[ARRAYIDX30_I]], align 8 -; CHECK-NEXT: [[MUL31_I:%.*]] = fmul double [[TEMP2]], [[TEMP7]] -; CHECK-NEXT: [[ADD32_I:%.*]] = fadd double [[MUL26_I]], [[MUL31_I]] -; CHECK-NEXT: [[ARRAYIDX37_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 0, i64 3 -; CHECK-NEXT: [[TEMP8:%.*]] = load double, ptr [[ARRAYIDX37_I]], align 8 -; CHECK-NEXT: [[MUL38_I:%.*]] = fmul double [[TEMP]], [[TEMP8]] -; CHECK-NEXT: [[ARRAYIDX42_I:%.*]] = getelementptr inbounds [4 x double], ptr [[B]], i64 1, i64 3 -; CHECK-NEXT: [[TEMP9:%.*]] = load double, ptr [[ARRAYIDX42_I]], align 8 -; CHECK-NEXT: [[MUL43_I:%.*]] = fmul double [[TEMP2]], [[TEMP9]] -; CHECK-NEXT: [[ADD44_I:%.*]] = fadd double [[MUL38_I]], [[MUL43_I]] ; CHECK-NEXT: [[ARRAYIDX47_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 0 ; CHECK-NEXT: [[TEMP10:%.*]] = load double, ptr [[ARRAYIDX47_I]], align 8 -; CHECK-NEXT: [[MUL50_I:%.*]] = fmul double [[TEMP1]], [[TEMP10]] ; CHECK-NEXT: [[ARRAYIDX52_I:%.*]] = getelementptr inbounds [2 x double], ptr [[A]], i64 1, i64 1 ; CHECK-NEXT: [[TEMP11:%.*]] = load double, ptr [[ARRAYIDX52_I]], align 8 -; CHECK-NEXT: [[MUL55_I:%.*]] = fmul double [[TEMP3]], [[TEMP11]] -; CHECK-NEXT: [[ADD56_I:%.*]] = fadd double [[MUL50_I]], [[MUL55_I]] -; CHECK-NEXT: [[MUL62_I:%.*]] = fmul double [[TEMP4]], [[TEMP10]] -; CHECK-NEXT: [[MUL67_I:%.*]] = fmul double [[TEMP5]], [[TEMP11]] -; CHECK-NEXT: [[ADD68_I:%.*]] = fadd double [[MUL62_I]], [[MUL67_I]] -; CHECK-NEXT: [[MUL74_I:%.*]] = fmul double [[TEMP6]], [[TEMP10]] -; CHECK-NEXT: [[MUL79_I:%.*]] = fmul double [[TEMP7]], [[TEMP11]] -; CHECK-NEXT: [[ADD80_I:%.*]] = fadd double [[MUL74_I]], 
[[MUL79_I]] -; CHECK-NEXT: [[MUL86_I:%.*]] = fmul double [[TEMP8]], [[TEMP10]] -; CHECK-NEXT: [[MUL91_I:%.*]] = fmul double [[TEMP9]], [[TEMP11]] -; CHECK-NEXT: [[ADD92_I:%.*]] = fadd double [[MUL86_I]], [[MUL91_I]] -; CHECK-NEXT: store double [[ADD_I]], ptr [[OUT:%.*]], align 8 -; CHECK-NEXT: [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 1 -; CHECK-NEXT: store double [[ADD20_I]], ptr [[RES_I_SROA_4_0_OUT2_I_SROA_IDX2]], align 8 -; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 2 -; CHECK-NEXT: store double [[ADD32_I]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8 -; CHECK-NEXT: [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 3 -; CHECK-NEXT: store double [[ADD44_I]], ptr [[RES_I_SROA_6_0_OUT2_I_SROA_IDX6]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TEMP]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr [[ARRAYIDX7_I]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TEMP2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4:%.*]] = getelementptr inbounds double, ptr [[OUT:%.*]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = load <2 x double>, ptr [[ARRAYIDX25_I]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <2 x double> [[TMP3]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = load <2 x double>, ptr [[ARRAYIDX30_I]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = fmul 
<2 x double> [[TMP7]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[TMP11]], [[TMP13]] +; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[OUT]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP14]], ptr [[RES_I_SROA_5_0_OUT2_I_SROA_IDX4]], align 8 ; CHECK-NEXT: [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 4 -; CHECK-NEXT: store double [[ADD56_I]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8 -; CHECK-NEXT: [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 5 -; CHECK-NEXT: store double [[ADD68_I]], ptr [[RES_I_SROA_8_0_OUT2_I_SROA_IDX10]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TEMP10]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x double> [[TMP15]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = fmul <2 x double> [[TMP1]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TEMP11]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[TMP18]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP5]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP17]], [[TMP20]] +; CHECK-NEXT: store <2 x double> [[TMP21]], ptr [[RES_I_SROA_7_0_OUT2_I_SROA_IDX8]], align 8 ; CHECK-NEXT: [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 6 -; CHECK-NEXT: store double [[ADD80_I]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8 -; CHECK-NEXT: [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14:%.*]] = getelementptr inbounds double, ptr [[OUT]], i64 7 -; CHECK-NEXT: store double [[ADD92_I]], ptr [[RES_I_SROA_10_0_OUT2_I_SROA_IDX14]], align 8 +; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP10]], [[TMP16]] +; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = fadd <2 x double> [[TMP22]], [[TMP23]] +; CHECK-NEXT: 
store <2 x double> [[TMP24]], ptr [[RES_I_SROA_9_0_OUT2_I_SROA_IDX12]], align 8 ; CHECK-NEXT: ret void ; %temp = load double, ptr %A, align 8 Index: llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll @@ -654,18 +654,17 @@ define void @single_membound(ptr %arg, ptr %arg1, double %x) { ; CHECK-LABEL: @single_membound( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1 +; CHECK-NEXT: [[TMP:%.*]] = fsub double [[X:%.*]], 9.900000e+01 ; CHECK-NEXT: store double [[TMP]], ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = load double, ptr [[ARG1:%.*]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = fsub double 1.000000e+00, [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 2 ; CHECK-NEXT: br label [[BB15:%.*]] ; CHECK: bb15: -; CHECK-NEXT: [[TMP16:%.*]] = fmul double [[TMP]], 2.000000e+01 -; CHECK-NEXT: store double [[TMP16]], ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = fmul double [[TMP13]], 3.000000e+01 -; CHECK-NEXT: store double [[TMP17]], ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[TMP]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[TMP9]], align 8 ; CHECK-NEXT: ret void ; entry: @@ -1232,27 +1231,25 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[T19:%.*]] = load ptr, ptr [[ARG:%.*]], align 8 ; CHECK-NEXT: [[T20:%.*]] = load float, ptr [[ARG_3:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> , float [[T20]], i32 1 ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB22:%.*]], 
label [[BB30:%.*]] ; CHECK: bb22: ; CHECK-NEXT: [[T23:%.*]] = fmul float [[T20]], 9.900000e+01 -; CHECK-NEXT: [[T24:%.*]] = fmul float [[T23]], 9.900000e+01 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds float, ptr [[T19]], i64 2 -; CHECK-NEXT: [[T26:%.*]] = fmul float [[T23]], 1.000000e+01 -; CHECK-NEXT: store float [[T26]], ptr [[T25]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[T23]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x float> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 +; CHECK-NEXT: store float [[TMP4]], ptr [[T25]], align 4 ; CHECK-NEXT: [[T27:%.*]] = load float, ptr [[ARG_2:%.*]], align 8 -; CHECK-NEXT: [[T28:%.*]] = fadd float [[T24]], 2.000000e+01 -; CHECK-NEXT: [[T29:%.*]] = fadd float [[T26]], 2.000000e+01 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> [[TMP3]], ; CHECK-NEXT: br label [[BB30]] ; CHECK: bb30: -; CHECK-NEXT: [[T31:%.*]] = phi float [ [[T28]], [[BB22]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[T32:%.*]] = phi float [ [[T29]], [[BB22]] ], [ [[T20]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[BB22]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[BB36:%.*]] ; CHECK: bb36: -; CHECK-NEXT: [[T37:%.*]] = fmul float [[T31]], 3.000000e+00 -; CHECK-NEXT: store float [[T37]], ptr [[ARG_3]], align 4 -; CHECK-NEXT: [[T39:%.*]] = fmul float [[T32]], 3.000000e+00 -; CHECK-NEXT: [[T40:%.*]] = getelementptr inbounds float, ptr [[ARG_3]], i64 1 -; CHECK-NEXT: store float [[T39]], ptr [[T40]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], +; CHECK-NEXT: store <2 x float> [[TMP7]], ptr [[ARG_3]], align 4 ; CHECK-NEXT: br label [[BB41:%.*]] ; CHECK: bb41: ; CHECK-NEXT: ret void Index: llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll 
=================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,389 +14,168 @@ ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] -; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]] -; CHECK-NEXT: [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] -; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]] -; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] -; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]] -; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]] -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2 -; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32 -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 
[[ADD_3]], [[CONV_4]] -; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]] -; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]] -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5 -; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32 -; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] -; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]] -; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]] -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6 -; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32 -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] -; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]] -; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]] -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7 -; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32 -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] -; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]] -; CHECK-NEXT: [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2 -; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] -; CHECK-NEXT: [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]] -; CHECK-NEXT: [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]] -; CHECK-NEXT: [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1 -; CHECK-NEXT: [[TMP9:%.*]] = 
load i16, ptr [[ARRAYIDX_1_1]], align 2 -; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32 -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] -; CHECK-NEXT: [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]] -; CHECK-NEXT: [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]] -; CHECK-NEXT: [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2 -; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32 -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] -; CHECK-NEXT: [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]] -; CHECK-NEXT: [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]] -; CHECK-NEXT: [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3 -; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2 -; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32 -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] -; CHECK-NEXT: [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]] -; CHECK-NEXT: [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]] -; CHECK-NEXT: [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2 -; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32 -; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] -; CHECK-NEXT: [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]] -; CHECK-NEXT: [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]] -; CHECK-NEXT: [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5 -; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2 -; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32 -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] -; CHECK-NEXT: [[MUL_5_1:%.*]] = mul nuw nsw i32 
[[CONV_5_1]], [[CONV_5_1]] -; CHECK-NEXT: [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]] -; CHECK-NEXT: [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6 -; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2 -; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32 -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] -; CHECK-NEXT: [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]] -; CHECK-NEXT: [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]] -; CHECK-NEXT: [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7 -; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2 -; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32 -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] -; CHECK-NEXT: [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]] -; CHECK-NEXT: [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]] +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2 -; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32 -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] -; CHECK-NEXT: [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]] -; CHECK-NEXT: [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]] -; CHECK-NEXT: [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1 -; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2 -; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32 -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] -; CHECK-NEXT: [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]] -; CHECK-NEXT: [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]] -; CHECK-NEXT: [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds 
i16, ptr [[ADD_PTR_1]], i64 2 -; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2 -; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32 -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] -; CHECK-NEXT: [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]] -; CHECK-NEXT: [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]] -; CHECK-NEXT: [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3 -; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2 -; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32 -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] -; CHECK-NEXT: [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]] -; CHECK-NEXT: [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]] -; CHECK-NEXT: [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2 -; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32 -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] -; CHECK-NEXT: [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]] -; CHECK-NEXT: [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]] -; CHECK-NEXT: [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5 -; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2 -; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32 -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] -; CHECK-NEXT: [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]] -; CHECK-NEXT: [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]] -; CHECK-NEXT: [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6 -; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2 -; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32 -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], 
[[CONV_6_2]] -; CHECK-NEXT: [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]] -; CHECK-NEXT: [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]] -; CHECK-NEXT: [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7 -; CHECK-NEXT: [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2 -; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32 -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] -; CHECK-NEXT: [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]] -; CHECK-NEXT: [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]] +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2 -; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32 -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] -; CHECK-NEXT: [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]] -; CHECK-NEXT: [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]] -; CHECK-NEXT: [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1 -; CHECK-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2 -; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32 -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] -; CHECK-NEXT: [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]] -; CHECK-NEXT: [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]] -; CHECK-NEXT: [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2 -; CHECK-NEXT: [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2 -; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32 -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] -; CHECK-NEXT: [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]] -; CHECK-NEXT: [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], 
[[ADD11_1_3]] -; CHECK-NEXT: [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3 -; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2 -; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32 -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] -; CHECK-NEXT: [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]] -; CHECK-NEXT: [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]] -; CHECK-NEXT: [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2 -; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32 -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] -; CHECK-NEXT: [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]] -; CHECK-NEXT: [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]] -; CHECK-NEXT: [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5 -; CHECK-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2 -; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32 -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] -; CHECK-NEXT: [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]] -; CHECK-NEXT: [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]] -; CHECK-NEXT: [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6 -; CHECK-NEXT: [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2 -; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32 -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] -; CHECK-NEXT: [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]] -; CHECK-NEXT: [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]] -; CHECK-NEXT: [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7 -; CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2 -; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 
[[TMP31]] to i32 -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] -; CHECK-NEXT: [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]] -; CHECK-NEXT: [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]] +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2 -; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32 -; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] -; CHECK-NEXT: [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]] -; CHECK-NEXT: [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]] -; CHECK-NEXT: [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1 -; CHECK-NEXT: [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2 -; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32 -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] -; CHECK-NEXT: [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]] -; CHECK-NEXT: [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]] -; CHECK-NEXT: [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2 -; CHECK-NEXT: [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2 -; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32 -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] -; CHECK-NEXT: [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]] -; CHECK-NEXT: [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]] -; CHECK-NEXT: [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3 -; CHECK-NEXT: [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2 -; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32 -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] -; CHECK-NEXT: [[MUL_3_4:%.*]] = mul nuw nsw i32 
[[CONV_3_4]], [[CONV_3_4]] -; CHECK-NEXT: [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]] -; CHECK-NEXT: [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4 -; CHECK-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2 -; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32 -; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] -; CHECK-NEXT: [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]] -; CHECK-NEXT: [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]] -; CHECK-NEXT: [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5 -; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2 -; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32 -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] -; CHECK-NEXT: [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]] -; CHECK-NEXT: [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]] -; CHECK-NEXT: [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6 -; CHECK-NEXT: [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2 -; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32 -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] -; CHECK-NEXT: [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]] -; CHECK-NEXT: [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]] -; CHECK-NEXT: [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7 -; CHECK-NEXT: [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2 -; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32 -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] -; CHECK-NEXT: [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]] -; CHECK-NEXT: [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]] +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds 
i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2 -; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32 -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] -; CHECK-NEXT: [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]] -; CHECK-NEXT: [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]] -; CHECK-NEXT: [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1 -; CHECK-NEXT: [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2 -; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32 -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] -; CHECK-NEXT: [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]] -; CHECK-NEXT: [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]] -; CHECK-NEXT: [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2 -; CHECK-NEXT: [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2 -; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32 -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] -; CHECK-NEXT: [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]] -; CHECK-NEXT: [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]] -; CHECK-NEXT: [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3 -; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2 -; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32 -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] -; CHECK-NEXT: [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]] -; CHECK-NEXT: [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]] -; CHECK-NEXT: [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4 -; CHECK-NEXT: [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2 -; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32 -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 
[[ADD_3_5]], [[CONV_4_5]] -; CHECK-NEXT: [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]] -; CHECK-NEXT: [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]] -; CHECK-NEXT: [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5 -; CHECK-NEXT: [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2 -; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32 -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] -; CHECK-NEXT: [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]] -; CHECK-NEXT: [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]] -; CHECK-NEXT: [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6 -; CHECK-NEXT: [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2 -; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32 -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] -; CHECK-NEXT: [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]] -; CHECK-NEXT: [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]] -; CHECK-NEXT: [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7 -; CHECK-NEXT: [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2 -; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32 -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] -; CHECK-NEXT: [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]] -; CHECK-NEXT: [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]] +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2 -; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32 -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] -; CHECK-NEXT: [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]] -; CHECK-NEXT: [[ADD11_663:%.*]] = add i32 
[[MUL_662]], [[ADD11_7_5]] -; CHECK-NEXT: [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1 -; CHECK-NEXT: [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2 -; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32 -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]] -; CHECK-NEXT: [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]] -; CHECK-NEXT: [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]] -; CHECK-NEXT: [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2 -; CHECK-NEXT: [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2 -; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32 -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] -; CHECK-NEXT: [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]] -; CHECK-NEXT: [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]] -; CHECK-NEXT: [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3 -; CHECK-NEXT: [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2 -; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32 -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] -; CHECK-NEXT: [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]] -; CHECK-NEXT: [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]] -; CHECK-NEXT: [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4 -; CHECK-NEXT: [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2 -; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32 -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] -; CHECK-NEXT: [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]] -; CHECK-NEXT: [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]] -; CHECK-NEXT: [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5 -; CHECK-NEXT: [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2 -; CHECK-NEXT: [[CONV_5_6:%.*]] = zext 
i16 [[TMP53]] to i32 -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] -; CHECK-NEXT: [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]] -; CHECK-NEXT: [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]] -; CHECK-NEXT: [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6 -; CHECK-NEXT: [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2 -; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32 -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] -; CHECK-NEXT: [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]] -; CHECK-NEXT: [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]] -; CHECK-NEXT: [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7 -; CHECK-NEXT: [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2 -; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32 -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] -; CHECK-NEXT: [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]] -; CHECK-NEXT: [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]] +; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP56]] to i32 -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] -; CHECK-NEXT: [[MUL_766:%.*]] = mul nuw nsw i32 [[CONV_764]], [[CONV_764]] -; CHECK-NEXT: [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]] -; CHECK-NEXT: [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1 -; CHECK-NEXT: [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2 -; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP57]] to i32 -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] -; CHECK-NEXT: [[MUL_1_7:%.*]] = mul nuw nsw i32 
[[CONV_1_7]], [[CONV_1_7]] -; CHECK-NEXT: [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]] -; CHECK-NEXT: [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2 -; CHECK-NEXT: [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2 -; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP58]] to i32 -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]] -; CHECK-NEXT: [[MUL_2_7:%.*]] = mul nuw nsw i32 [[CONV_2_7]], [[CONV_2_7]] -; CHECK-NEXT: [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]] -; CHECK-NEXT: [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3 -; CHECK-NEXT: [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2 -; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP59]] to i32 -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] -; CHECK-NEXT: [[MUL_3_7:%.*]] = mul nuw nsw i32 [[CONV_3_7]], [[CONV_3_7]] -; CHECK-NEXT: [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]] -; CHECK-NEXT: [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4 -; CHECK-NEXT: [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2 -; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP60]] to i32 -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] -; CHECK-NEXT: [[MUL_4_7:%.*]] = mul nuw nsw i32 [[CONV_4_7]], [[CONV_4_7]] -; CHECK-NEXT: [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]] -; CHECK-NEXT: [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5 -; CHECK-NEXT: [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2 -; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP61]] to i32 -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] -; CHECK-NEXT: [[MUL_5_7:%.*]] = mul nuw nsw i32 [[CONV_5_7]], [[CONV_5_7]] -; CHECK-NEXT: [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]] -; CHECK-NEXT: [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6 -; CHECK-NEXT: [[TMP62:%.*]] = load 
i16, ptr [[ARRAYIDX_6_7]], align 2 -; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP62]] to i32 -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]] -; CHECK-NEXT: [[MUL_6_7:%.*]] = mul nuw nsw i32 [[CONV_6_7]], [[CONV_6_7]] -; CHECK-NEXT: [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]] -; CHECK-NEXT: [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7 -; CHECK-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2 -; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP63]] to i32 -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] -; CHECK-NEXT: [[MUL_7_7:%.*]] = mul nuw nsw i32 [[CONV_7_7]], [[CONV_7_7]] -; CHECK-NEXT: [[ADD11_7_7:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]] +; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i16> [[TMP8]], <64 x i16> [[TMP9]], <64 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP15]], <64 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i16> [[TMP16]], <64 x i16> [[TMP17]], <64 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> +; 
CHECK-NEXT: [[TMP20:%.*]] = shufflevector <64 x i16> [[TMP18]], <64 x i16> [[TMP19]], <64 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <64 x i16> [[TMP20]], <64 x i16> [[TMP21]], <64 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = zext <64 x i16> [[TMP22]] to <64 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP23]], i32 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP23]], i32 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = mul nuw nsw <64 x i32> [[TMP23]], [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP23]], i32 2 +; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP23]], i32 3 +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP28]] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP23]], i32 4 +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP29]] +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP23]], i32 5 +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP23]], i32 6 +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP31]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP23]], i32 7 +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP32]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP23]], i32 8 +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP33]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP23]], i32 9 +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP34]] +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP23]], i32 10 +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP35]] +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> 
[[TMP23]], i32 11 +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP36]] +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP23]], i32 12 +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP37]] +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP23]], i32 13 +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP23]], i32 14 +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP39]] +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP23]], i32 15 +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP23]], i32 16 +; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP23]], i32 17 +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP42]] +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP23]], i32 18 +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP43]] +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP23]], i32 19 +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP44]] +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP23]], i32 20 +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP23]], i32 21 +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP46]] +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP23]], i32 22 +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP47]] +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP23]], i32 23 +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP48]] +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP23]], i32 24 +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], 
[[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP23]], i32 25 +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP50]] +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP23]], i32 26 +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP51]] +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP23]], i32 27 +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP52]] +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP23]], i32 28 +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP53]] +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP23]], i32 29 +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP54]] +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP23]], i32 30 +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP55]] +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP23]], i32 31 +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP56]] +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP23]], i32 32 +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP23]], i32 33 +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP58]] +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP23]], i32 34 +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP59]] +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP23]], i32 35 +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP60]] +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP23]], i32 36 +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP61]] +; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP23]], i32 37 +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP62]] +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP23]], i32 38 +; 
CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP63]] +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP23]], i32 39 +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP64]] +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP23]], i32 40 +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP65]] +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP23]], i32 41 +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP66]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP23]], i32 42 +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP67]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP23]], i32 43 +; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP68]] +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP23]], i32 44 +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP69]] +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP23]], i32 45 +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP70]] +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP23]], i32 46 +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP71]] +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP23]], i32 47 +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP72]] +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP23]], i32 48 +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP73]] +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP23]], i32 49 +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP74]] +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP23]], i32 50 +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP75]] +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP23]], i32 51 +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP76]] +; CHECK-NEXT: 
[[TMP77:%.*]] = extractelement <64 x i32> [[TMP23]], i32 52 +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP77]] +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP23]], i32 53 +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP78]] +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP23]], i32 54 +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP79]] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP23]], i32 55 +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP23]], i32 56 +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP81]] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <64 x i32> [[TMP23]], i32 57 +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP82]] +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <64 x i32> [[TMP23]], i32 58 +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP83]] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <64 x i32> [[TMP23]], i32 59 +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP84]] +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <64 x i32> [[TMP23]], i32 60 +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP85]] +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <64 x i32> [[TMP23]], i32 61 +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP86]] +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <64 x i32> [[TMP23]], i32 62 +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP87]] +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <64 x i32> [[TMP23]], i32 63 +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP88]] +; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP26]]) ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[ADD11_7_7]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext 
i32 [[TMP89]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] @@ -798,111 +577,23 @@ ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; CHECK: for.cond1.preheader: ; CHECK-NEXT: [[Y_038:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC13:%.*]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD11_15:%.*]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD_15:%.*]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ] ; CHECK-NEXT: [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2 -; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]] -; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]] -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]] -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32 -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]] -; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]] -; CHECK-NEXT: [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2 -; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32 -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]] -; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]] -; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 
[[MUL_2]], [[ADD11_1]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2 -; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32 -; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]] -; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]] -; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]] -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2 -; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32 -; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]] -; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]] -; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]] -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5 -; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32 -; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]] -; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]] -; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]] -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6 -; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32 -; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]] -; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]] -; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]] -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7 -; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32 -; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]] -; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], 
[[CONV_7]] -; CHECK-NEXT: [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]] -; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2 -; CHECK-NEXT: [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]] -; CHECK-NEXT: [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]] -; CHECK-NEXT: [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]] -; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9 -; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2 -; CHECK-NEXT: [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32 -; CHECK-NEXT: [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]] -; CHECK-NEXT: [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]] -; CHECK-NEXT: [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]] -; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10 -; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2 -; CHECK-NEXT: [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32 -; CHECK-NEXT: [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]] -; CHECK-NEXT: [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]] -; CHECK-NEXT: [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]] -; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11 -; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2 -; CHECK-NEXT: [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32 -; CHECK-NEXT: [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]] -; CHECK-NEXT: [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]] -; CHECK-NEXT: [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]] -; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12 -; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2 -; CHECK-NEXT: [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32 -; CHECK-NEXT: [[ADD_12:%.*]] = add 
i32 [[ADD_11]], [[CONV_12]] -; CHECK-NEXT: [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]] -; CHECK-NEXT: [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]] -; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13 -; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2 -; CHECK-NEXT: [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32 -; CHECK-NEXT: [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]] -; CHECK-NEXT: [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]] -; CHECK-NEXT: [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]] -; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14 -; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2 -; CHECK-NEXT: [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32 -; CHECK-NEXT: [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]] -; CHECK-NEXT: [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]] -; CHECK-NEXT: [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]] -; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15 -; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2 -; CHECK-NEXT: [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32 -; CHECK-NEXT: [[ADD_15]] = add i32 [[ADD_14]], [[CONV_15]] -; CHECK-NEXT: [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]] -; CHECK-NEXT: [[ADD11_15]] = add i32 [[MUL_15]], [[ADD11_14]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]] ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds 
i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[INC13]] = add nuw nsw i32 [[Y_038]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_15]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[ADD11_15]] to i64 +; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[OP_RDX1]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] Index: llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll =================================================================== --- llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll +++ llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll @@ -206,22 +206,22 @@ define float @slp_not_profitable_in_loop(float %x, ptr %A) { ; CHECK-LABEL: @slp_not_profitable_in_loop( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 1 -; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[GEP_A_1]], align 4 -; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 2 ; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_A_2]], align 4 -; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[L_3:%.*]] = load float, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[X:%.*]], i32 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[MUL11:%.*]] = fmul fast 
float 3.000000e+00, [[L_0]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float 3.000000e+00, [[L_1]] -; CHECK-NEXT: [[MUL14:%.*]] = fmul fast float [[X:%.*]], [[L_2]] ; CHECK-NEXT: [[MUL16:%.*]] = fmul fast float 3.000000e+00, [[L_3]] -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]] -; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[MUL14]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[ADD13:%.*]] = fadd fast float [[ADD]], [[TMP4]] ; CHECK-NEXT: [[RED_NEXT]] = fadd fast float [[ADD13]], [[MUL16]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV]], 10 Index: llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll =================================================================== --- llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll +++ llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll @@ -505,9 +505,9 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx( ; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]] -; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i32 0 +; CHECK-NEXT: [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i32 1 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <4 x i32>, ptr %x @@ -521,10 +521,9 @@ ; because the vector large vector requires 2 vector registers. 
define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X:%.*]], i32 0, i32 0 -; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1]], align 16 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[X]], i32 0, i32 6 -; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16 +; CHECK-NEXT: [[E_0:%.*]] = extractelement <8 x i32> [[LV]], i32 0 +; CHECK-NEXT: [[E_1:%.*]] = extractelement <8 x i32> [[LV]], i32 6 ; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ;